In [2]:
import pandas as pd

data={
    "name":['pavan', 'kapil', 'lalit', 'ishan', 'krish'],
    "age":[25, None, 44, 23, None],
    "salary": [50000, 60000, 70000, None, None]
}
df=pd.DataFrame(data)
print(df)

    name   age   salary
0  pavan  25.0  50000.0
1  kapil   NaN  60000.0
2  lalit  44.0  70000.0
3  ishan  23.0      NaN
4  krish   NaN      NaN


In [15]:
#handling missing data

print(df.isnull().sum())
print(df[df['age'].isnull()])

df_drop=df.dropna()
print(df_drop)

df['age'].fillna(df['age'].mean(), inplace=True)
df['salary'].fillna(df['salary'].mean(), inplace=True)

print(df)

print(df.isnull().mean() * 100) # it will give u the percentage of data missing for better understanding

name      0
age       0
salary    0
dtype: int64
Empty DataFrame
Columns: [name, age, salary]
Index: []
    name        age   salary
0  pavan  25.000000  50000.0
1  kapil  30.666667  60000.0
2  lalit  44.000000  70000.0
3  ishan  23.000000  60000.0
4  krish  30.666667  60000.0
    name        age   salary
0  pavan  25.000000  50000.0
1  kapil  30.666667  60000.0
2  lalit  44.000000  70000.0
3  ishan  23.000000  60000.0
4  krish  30.666667  60000.0
name      0.0
age       0.0
salary    0.0
dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['salary'].fillna(df['salary'].mean(), inplace=True)


In [13]:
#Machine Learning only understand numerical data, so we need to convert categorical data into numerical data
#label encoding
#sklearn.preprocessing= contains functions to scale/normalize your data

from sklearn.preprocessing import LabelEncoder
import pandas as pd

df=pd.read_csv("sample_data.csv")
df_label=df.copy()

le=LabelEncoder() # This will convert categorical data into numerical data
df_label['gender_encoded']= le.fit_transform(df_label['gender']) # passing the actual column to be encoded
df_label['passed_encoded']= le.fit_transform(df_label['passed'])

print("\nlabel encoded data: ")
print(df_label[['name', 'gender', 'gender_encoded', 'passed', 'passed_encoded']]) 

#the above we have passed only the columns we want to see, but we can also pass the whole dataframe
# print(df_label) # we can pass like that to see the whole dataframe with encoded columns


label encoded data: 
      name  gender  gender_encoded passed  passed_encoded
0     aman    male               1    yes               1
1    priya  female               0    yes               1
2    rahul    male               1     no               0
3   anjali  female               0    yes               1
4     ravi    male               1    yes               1
5    meera  female               0     no               0
6    arjun    male               1    yes               1
7     neha  female               0    yes               1
8    imran    male               1     no               0
9    sneha  female               0    yes               1
10     raj    male               1    yes               1
11   divya  female               0     no               0
12   kabir    male               1    yes               1
13  simran  female               0    yes               1
14   karan    male               1     no               0
15   pooja  female               0    yes         

In [11]:
#we connot use label encoding for city, bcoz it only works for categorical data like (male, female, yes, no)
# for city we can use one hot encoding

df_encoded=pd.get_dummies(df_label, columns=['city'], dtype=int) # using dtype to convert boolean value into integer
print("\none-hot encoded data (city): ")
print(df_encoded)


one-hot encoded data (city): 
      name  gender passed  gender_encoded  passed_encoded  city_bangalore  \
0     aman    male    yes               1               1               0   
1    priya  female    yes               0               1               0   
2    rahul    male     no               1               0               1   
3   anjali  female    yes               0               1               0   
4     ravi    male    yes               1               1               0   
5    meera  female     no               0               0               0   
6    arjun    male    yes               1               1               1   
7     neha  female    yes               0               1               0   
8    imran    male     no               1               0               0   
9    sneha  female    yes               0               1               0   
10     raj    male    yes               1               1               0   
11   divya  female     no               0    

In [1]:
#feature scaling
#standard scaler= scales feature to have mean=0 and std=1
#minmax scaler= scales feature to a range between 0 and 1

from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler= StandardScaler()
x_scaled=scaler.fit_transform()

scaler=MinMaxScaler()
x_scaled=scaler.fit_transform()

TypeError: TransformerMixin.fit_transform() missing 1 required positional argument: 'X'

In [18]:
#train test split= splits the data into training and testing data

import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

data={
    'studyhours':[1,2,3,4,5],
    'testscore':[40,50,60,70,80]
}

df=pd.DataFrame(data)

#standard scaler
standard_scaler=StandardScaler()
standard_scaled=standard_scaler.fit_transform(df)

print("\nStandard Scaler output:")
print(pd.DataFrame(standard_scaled, columns=['studyhours', 'testscore']))

#minmax scaler
minmax_scaler=MinMaxScaler()
minmax_scaled=minmax_scaler.fit_transform(df)

print("\nMinMax Scaled output:")
print(pd.DataFrame(minmax_scaled, columns=['studyhours', 'testscore']))

#train test split
X=df[['studyhours']] # independent variable 
y=df[['testscore']] 

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)
print("training data:")
print(X_train)

print("test data:")
print(X_test)

print("training data:")
print(y_train)

print("test data:")
print(y_test)


Standard Scaler output:
   studyhours  testscore
0   -1.414214  -1.414214
1   -0.707107  -0.707107
2    0.000000   0.000000
3    0.707107   0.707107
4    1.414214   1.414214

MinMax Scaled output:
   studyhours  testscore
0        0.00       0.00
1        0.25       0.25
2        0.50       0.50
3        0.75       0.75
4        1.00       1.00
training data:
   studyhours
4           5
2           3
0           1
3           4
test data:
   studyhours
1           2
training data:
   testscore
4         80
2         60
0         40
3         70
test data:
   testscore
1         50
