## 1. Data Import and Cleaning

In [1]:

import pandas as pd

df = pd.read_csv(r"C:\Users\syeda\Downloads\sample_data.csv")

# Drop rows with missing values
df_dropna = df.dropna()

# Replace missing numerical values with mean
df_fill_mean = df.copy()
df_fill_mean['C'] = df_fill_mean['C'].fillna(df_fill_mean['C'].mean())

# Replace missing categorical values with mode
df_fill_mode = df.copy()
df_fill_mode['E'] = df_fill_mode['E'].fillna(df_fill_mode['E'].mode()[0])

df_dropna, df_fill_mean, df_fill_mode


  from pandas.core import (


(    A   B     C    D      E        Date
 1  93  24  64.0  dog  apple  2023-01-02
 2  15   3  60.0  cat  apple  2023-01-03
 5  21   2  33.0  dog  apple  2023-01-06
 7  87  30  76.0  dog  apple  2023-01-08,
     A   B     C    D      E        Date
 0  52  88  52.0  cat    NaN  2023-01-01
 1  93  24  64.0  dog  apple  2023-01-02
 2  15   3  60.0  cat  apple  2023-01-03
 3  72  22  52.0  dog  apple  2023-01-04
 4  61  53  21.0  cat    NaN  2023-01-05
 5  21   2  33.0  dog  apple  2023-01-06
 6  83  88  52.0  cat  apple  2023-01-07
 7  87  30  76.0  dog  apple  2023-01-08
 8  75  38  58.0  cat    NaN  2023-01-09
 9  75   2  52.0  dog  apple  2023-01-10,
     A   B     C    D      E        Date
 0  52  88   NaN  cat  apple  2023-01-01
 1  93  24  64.0  dog  apple  2023-01-02
 2  15   3  60.0  cat  apple  2023-01-03
 3  72  22   NaN  dog  apple  2023-01-04
 4  61  53  21.0  cat  apple  2023-01-05
 5  21   2  33.0  dog  apple  2023-01-06
 6  83  88   NaN  cat  apple  2023-01-07
 7  87  30  76

## 2. Data Transformation

In [2]:

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df['A_plus_B'] = np.add(df['A'], df['B'])
df['sqrt_A'] = np.sqrt(df['A'])

scaler = MinMaxScaler()
df['A_normalized'] = scaler.fit_transform(df[['A']])

df[['A', 'B', 'A_plus_B', 'sqrt_A', 'A_normalized']]


Unnamed: 0,A,B,A_plus_B,sqrt_A,A_normalized
0,52,88,140,7.211103,0.474359
1,93,24,117,9.643651,1.0
2,15,3,18,3.872983,0.0
3,72,22,94,8.485281,0.730769
4,61,53,114,7.81025,0.589744
5,21,2,23,4.582576,0.076923
6,83,88,171,9.110434,0.871795
7,87,30,117,9.327379,0.923077
8,75,38,113,8.660254,0.769231
9,75,2,77,8.660254,0.769231


## 3. Merging and Joining Datasets

In [3]:

df2 = df[['A', 'B']].copy()
df2['key'] = list(range(10))
df['key'] = list(range(10))

merged_df = pd.merge(df, df2, on='key', how='inner').fillna(0)

df_left = pd.DataFrame({'id': [1, 2], 'val1': ['x', 'y']})
df_right = pd.DataFrame({'ref': [2, 3], 'val2': ['a', 'b']})
left_join = pd.merge(df_left, df_right, left_on='id', right_on='ref', how='left')

df_concat = pd.concat([df, df2], axis=1)
df_concat = df_concat.loc[:, ~df_concat.columns.duplicated()]

merged_df, left_join, df_concat


(   A_x  B_x     C    D      E        Date  A_plus_B    sqrt_A  A_normalized  \
 0   52   88   0.0  cat      0  2023-01-01       140  7.211103      0.474359   
 1   93   24  64.0  dog  apple  2023-01-02       117  9.643651      1.000000   
 2   15    3  60.0  cat  apple  2023-01-03        18  3.872983      0.000000   
 3   72   22   0.0  dog  apple  2023-01-04        94  8.485281      0.730769   
 4   61   53  21.0  cat      0  2023-01-05       114  7.810250      0.589744   
 5   21    2  33.0  dog  apple  2023-01-06        23  4.582576      0.076923   
 6   83   88   0.0  cat  apple  2023-01-07       171  9.110434      0.871795   
 7   87   30  76.0  dog  apple  2023-01-08       117  9.327379      0.923077   
 8   75   38  58.0  cat      0  2023-01-09       113  8.660254      0.769231   
 9   75    2   0.0  dog  apple  2023-01-10        77  8.660254      0.769231   
 
    key  A_y  B_y  
 0    0   52   88  
 1    1   93   24  
 2    2   15    3  
 3    3   72   22  
 4    4   61   53 

## 4. Grouping and Aggregation

In [4]:

grouped = df.groupby('D')['A'].agg(['mean', 'std'])

sum_grouped = df.groupby('D')['A'].sum()
sum_grouped_np = np.sqrt(sum_grouped)

pivot_table = df.pivot_table(values='A', index='D', columns='E', aggfunc=np.mean)

grouped, sum_grouped_np, pivot_table


  pivot_table = df.pivot_table(values='A', index='D', columns='E', aggfunc=np.mean)


(     mean        std
 D                   
 cat  57.2  26.480181
 dog  69.6  28.492104,
 D
 cat    16.911535
 dog    18.654758
 Name: A, dtype: float64,
 E    apple
 D         
 cat   49.0
 dog   69.6)

## 5. Array Operations and Manipulation

In [5]:

arr = df['A'].values
arr_squared = arr ** 2

reshaped_arr = arr.reshape(-1, 1)
df['reshaped_A'] = reshaped_arr

filtered_df = df[df['A'] > 50]

arr_squared, reshaped_arr, filtered_df


(array([2704, 8649,  225, 5184, 3721,  441, 6889, 7569, 5625, 5625],
       dtype=int64),
 array([[52],
        [93],
        [15],
        [72],
        [61],
        [21],
        [83],
        [87],
        [75],
        [75]], dtype=int64),
     A   B     C    D      E        Date  A_plus_B    sqrt_A  A_normalized  \
 0  52  88   NaN  cat    NaN  2023-01-01       140  7.211103      0.474359   
 1  93  24  64.0  dog  apple  2023-01-02       117  9.643651      1.000000   
 3  72  22   NaN  dog  apple  2023-01-04        94  8.485281      0.730769   
 4  61  53  21.0  cat    NaN  2023-01-05       114  7.810250      0.589744   
 6  83  88   NaN  cat  apple  2023-01-07       171  9.110434      0.871795   
 7  87  30  76.0  dog  apple  2023-01-08       117  9.327379      0.923077   
 8  75  38  58.0  cat    NaN  2023-01-09       113  8.660254      0.769231   
 9  75   2   NaN  dog  apple  2023-01-10        77  8.660254      0.769231   
 
    key  reshaped_A  
 0    0          52  
 1    1

## 6. Broadcasting and Vectorized Operations

In [6]:

broadcast_array = np.array([1])
df['broadcasted'] = df['A'] + broadcast_array[0]

df['multi_column_sum'] = np.add(df['A'], df['B'])

row_means = df.iloc[:, 0:2].mean(axis=1)
df_broadcast = df.iloc[:, 0:2].subtract(row_means, axis=0)

df[['broadcasted', 'multi_column_sum']], df_broadcast


(   broadcasted  multi_column_sum
 0           53               140
 1           94               117
 2           16                18
 3           73                94
 4           62               114
 5           22                23
 6           84               171
 7           88               117
 8           76               113
 9           76                77,
       A     B
 0 -18.0  18.0
 1  34.5 -34.5
 2   6.0  -6.0
 3  25.0 -25.0
 4   4.0  -4.0
 5   9.5  -9.5
 6  -2.5   2.5
 7  28.5 -28.5
 8  18.5 -18.5
 9  36.5 -36.5)

## 7. Linear Algebra with NumPy

In [7]:

# Solve system: 2x + y = 8; 3x + 2y = 13
A_sys = np.array([[2, 1], [3, 2]])
b_sys = np.array([8, 13])
solution = np.linalg.solve(A_sys, b_sys)

df_dot = np.dot(df['A'], df['B'])

matrix_mult = pd.DataFrame(np.dot(df[['A', 'B']].values.T, df[['A', 'B']].values))

solution, df_dot, matrix_mult


(array([3., 2.]),
 24626,
        0      1
 0  46632  24626
 1  24626  21718)

## 8. Handling Missing Data

In [8]:

df_interp = df.copy()
df_interp['C'] = df_interp['C'].interpolate(method='linear')

mask = df['C'].isna()
df_masked = df.copy()
df_masked['C'][mask] = -1

q1 = df['C'].quantile(0.25)
q3 = df['C'].quantile(0.75)
iqr = q3 - q1
outlier_mask = (df['C'] < (q1 - 1.5 * iqr)) | (df['C'] > (q3 + 1.5 * iqr))
df_outliers_handled = df.copy()
df_outliers_handled.loc[outlier_mask, 'C'] = df['C'].median()

df_interp, df_masked, df_outliers_handled


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_masked['C'][mask] = -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_masked['C'][mask] = -1


(    A   B     C    D      E        Date  A_plus_B    sqrt_A  A_normalized  \
 0  52  88   NaN  cat    NaN  2023-01-01       140  7.211103      0.474359   
 1  93  24  64.0  dog  apple  2023-01-02       117  9.643651      1.000000   
 2  15   3  60.0  cat  apple  2023-01-03        18  3.872983      0.000000   
 3  72  22  40.5  dog  apple  2023-01-04        94  8.485281      0.730769   
 4  61  53  21.0  cat    NaN  2023-01-05       114  7.810250      0.589744   
 5  21   2  33.0  dog  apple  2023-01-06        23  4.582576      0.076923   
 6  83  88  54.5  cat  apple  2023-01-07       171  9.110434      0.871795   
 7  87  30  76.0  dog  apple  2023-01-08       117  9.327379      0.923077   
 8  75  38  58.0  cat    NaN  2023-01-09       113  8.660254      0.769231   
 9  75   2  58.0  dog  apple  2023-01-10        77  8.660254      0.769231   
 
    key  reshaped_A  broadcasted  multi_column_sum  
 0    0          52           53               140  
 1    1          93           94  

## 9. Advanced Data Analysis

In [9]:

trend_df = df.groupby(['D', 'E'])['A'].mean().unstack()

correlation = df[['A', 'B', 'C']].corr()

df['rolling_mean'] = df['A'].rolling(window=3).mean()
df[['Date', 'A', 'rolling_mean']]


Unnamed: 0,Date,A,rolling_mean
0,2023-01-01,52,
1,2023-01-02,93,
2,2023-01-03,15,53.333333
3,2023-01-04,72,60.0
4,2023-01-05,61,49.333333
5,2023-01-06,21,51.333333
6,2023-01-07,83,55.0
7,2023-01-08,87,63.666667
8,2023-01-09,75,81.666667
9,2023-01-10,75,79.0


## 10. DataFrame and Array Manipulation

In [13]:

import pandas as pd
import numpy as np

# Load the DataFrame
df = pd.read_csv("C:/Users/syeda/Downloads/sample_data.csv")

# ✅ Step 1: Only apply numeric operation to numeric columns
numeric_df = df.select_dtypes(include=[np.number])
arr = numeric_df.values
arr += 1
df_transformed = pd.DataFrame(arr, columns=numeric_df.columns)

# ✅ Step 2: Filter random DataFrame
random_df = pd.DataFrame(np.random.rand(10, 3), columns=['X', 'Y', 'Z'])
filtered_random_df = random_df[(random_df['X'] > 0.5) & (random_df['Y'] < 0.5)]

# ✅ Step 3: Custom feature from A and B
arr1 = np.array(df['A'], dtype=float)
arr2 = np.array(df['B'], dtype=float)
df_custom = pd.DataFrame(np.power(arr1, 2) + np.power(arr2, 2), columns=['custom_sum'])

# ✅ Display all three outputs
print("✅ Transformed Numeric DataFrame:")
print(df_transformed)

print("\n✅ Filtered Random DataFrame:")
print(filtered_random_df)

print("\n✅ Custom Feature DataFrame:")
print(df_custom)



✅ Transformed Numeric DataFrame:
      A     B     C
0  53.0  89.0   NaN
1  94.0  25.0  65.0
2  16.0   4.0  61.0
3  73.0  23.0   NaN
4  62.0  54.0  22.0
5  22.0   3.0  34.0
6  84.0  89.0   NaN
7  88.0  31.0  77.0
8  76.0  39.0  59.0
9  76.0   3.0   NaN

✅ Filtered Random DataFrame:
          X         Y         Z
7  0.816023  0.233229  0.529765
9  0.642664  0.475739  0.074028

✅ Custom Feature DataFrame:
   custom_sum
0     10448.0
1      9225.0
2       234.0
3      5668.0
4      6530.0
5       445.0
6     14633.0
7      8469.0
8      7069.0
9      5629.0


## 11. Data Reshaping and Analysis

In [11]:

reshaped = df['A'].values.reshape(2, 5)

df1 = df.head(5)
df2 = df.tail(5)
stacked = pd.concat([df1, df2], axis=0)

array3d = np.random.randint(1, 10, size=(2, 3, 3))
df_3d = pd.DataFrame(array3d.reshape(6, 3))
df_grouped_3d = df_3d.groupby(df_3d.index % 2).sum()

reshaped, stacked, df_grouped_3d


(array([[52, 93, 15, 72, 61],
        [21, 83, 87, 75, 75]], dtype=int64),
     A   B     C    D      E        Date  A_plus_B    sqrt_A  A_normalized  \
 0  52  88   NaN  cat    NaN  2023-01-01       140  7.211103      0.474359   
 1  93  24  64.0  dog  apple  2023-01-02       117  9.643651      1.000000   
 2  15   3  60.0  cat  apple  2023-01-03        18  3.872983      0.000000   
 3  72  22   NaN  dog  apple  2023-01-04        94  8.485281      0.730769   
 4  61  53  21.0  cat    NaN  2023-01-05       114  7.810250      0.589744   
 5  21   2  33.0  dog  apple  2023-01-06        23  4.582576      0.076923   
 6  83  88   NaN  cat  apple  2023-01-07       171  9.110434      0.871795   
 7  87  30  76.0  dog  apple  2023-01-08       117  9.327379      0.923077   
 8  75  38  58.0  cat    NaN  2023-01-09       113  8.660254      0.769231   
 9  75   2   NaN  dog  apple  2023-01-10        77  8.660254      0.769231   
 
    key  reshaped_A  broadcasted  multi_column_sum  rolling_mean 

## 12. Time Series Analysis

In [12]:

df['Date'] = pd.to_datetime(df['Date'])
df['days_diff'] = (df['Date'] - df['Date'].shift()).dt.days.fillna(0)

df['moving_avg'] = df['A'].rolling(window=3).mean()

datetime_diff = df['Date'].diff().dt.days

df[['Date', 'A', 'moving_avg', 'days_diff']], datetime_diff


(        Date   A  moving_avg  days_diff
 0 2023-01-01  52         NaN        0.0
 1 2023-01-02  93         NaN        1.0
 2 2023-01-03  15   53.333333        1.0
 3 2023-01-04  72   60.000000        1.0
 4 2023-01-05  61   49.333333        1.0
 5 2023-01-06  21   51.333333        1.0
 6 2023-01-07  83   55.000000        1.0
 7 2023-01-08  87   63.666667        1.0
 8 2023-01-09  75   81.666667        1.0
 9 2023-01-10  75   79.000000        1.0,
 0    NaN
 1    1.0
 2    1.0
 3    1.0
 4    1.0
 5    1.0
 6    1.0
 7    1.0
 8    1.0
 9    1.0
 Name: Date, dtype: float64)