In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import plotly as pl 
import seaborn as sns 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## Encoding Categorical Variables

#### There are two types of categorical data
##### 1: Nominal 
##### 2: Ordinal

### Label Encoding

##### In label encoding, each unique category value is assigned an integer based on alphabetical order or occurrence. It works well when the categorical data has an intrinsic order

##### Label encoding is helpful when you are working on output column when for example it give answere like either customer buy the product or not and result column contain values like 'yes' and 'No' 

In [5]:
from sklearn.preprocessing import LabelEncoder

data = ['Class A', 'Class B', 'Class C', 'Class A']
encoder = LabelEncoder()
encoded_data = encoder.fit_transform(data)

print(encoded_data)

[0 1 2 0]


### Ordinal Encoding
##### Ordinal encoding is useful when the categorical values have a clear, natural order. It maps each unique value to an integer but maintains the ordinal relationship.

In [17]:
from sklearn.preprocessing import OrdinalEncoder

data = pd.DataFrame({
    "Size": ["small", "medium", "Large", "medium", "Large", "small"],
    "temperature": ['low', 'medium', 'high', 'low', 'medium', 'high']
})

O_encoder = OrdinalEncoder(categories=[["small","medium","Large"], ['low','medium', "high"]])
O_encoder
O_encoder.fit(data)
encoded_data = O_encoder.transform(data)

print(encoded_data)

[[0. 0.]
 [1. 1.]
 [2. 2.]
 [1. 0.]
 [2. 1.]
 [0. 2.]]


In [18]:
df = sns.load_dataset("tips")
df.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [19]:
encoder = OrdinalEncoder()
dt_columns = df[["sex","smoker","day","time"]]
# dt_columns
encoded_data = encoder.fit_transform(df[["day"]])
print(encoded_data)

[[2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.

In [20]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoded_data = encoder.fit_transform(df[["time"]])
print(encoded_data)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


  y = column_or_1d(y, warn=True)


##### We need to train test siplit before encoding also for Ordinal Encoder you have to pass list like
##### oe = OrdinalEncoder(categrories=[['low','Average','good','Excellent']]) this will help full because you will
##### understande the data when it is in action that 0 for low and 1 for avergae and so on otherwise algoritham dicide by himself randomly
##### Also you can pass mulitple lists like
##### oe = OrdinalEncoder(categrories=[['low','Average','good','Excellent'],['school','college','university']]) this will help full because you will

## One-Hot Encoding

In [None]:
# One hot encoding using pandas get_dummies
dt = sns.load_dataset('titanic')
dt.sample(5)
dt.head(5)

# Applying One-Hot Encoding to 'sex', 'embarked', and 'class' columns
titanic_encoded = pd.get_dummies(dt, columns=['sex', 'embarked', 'class'])

titanic_encoded.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,who,adult_male,deck,embark_town,alive,alone,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third
0,0,3,22.0,1,0,7.25,man,True,,Southampton,no,False,False,True,False,False,True,False,False,True
1,1,1,38.0,1,0,71.2833,woman,False,C,Cherbourg,yes,False,True,False,True,False,False,True,False,False
2,1,3,26.0,0,0,7.925,woman,False,,Southampton,yes,True,True,False,False,False,True,False,False,True
3,1,1,35.0,1,0,53.1,woman,False,C,Southampton,yes,False,True,False,False,False,True,True,False,False
4,0,3,35.0,0,0,8.05,man,True,,Southampton,no,True,False,True,False,False,True,False,False,True


In [None]:
from sklearn.preprocessing import OneHotEncoder
df = sns.load_dataset('titanic')

df.sample(5)
cat_col = df[['sex', 'embarked', 'class']]
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(cat_col)
print(encoded_data)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2673 stored elements and shape (891, 9)>
  Coords	Values
  (0, 1)	1.0
  (0, 4)	1.0
  (0, 8)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 6)	1.0
  (2, 0)	1.0
  (2, 4)	1.0
  (2, 8)	1.0
  (3, 0)	1.0
  (3, 4)	1.0
  (3, 6)	1.0
  (4, 1)	1.0
  (4, 4)	1.0
  (4, 8)	1.0
  (5, 1)	1.0
  (5, 3)	1.0
  (5, 8)	1.0
  (6, 1)	1.0
  (6, 4)	1.0
  (6, 6)	1.0
  (7, 1)	1.0
  (7, 4)	1.0
  (7, 8)	1.0
  (8, 0)	1.0
  :	:
  (882, 8)	1.0
  (883, 1)	1.0
  (883, 4)	1.0
  (883, 7)	1.0
  (884, 1)	1.0
  (884, 4)	1.0
  (884, 8)	1.0
  (885, 0)	1.0
  (885, 3)	1.0
  (885, 8)	1.0
  (886, 1)	1.0
  (886, 4)	1.0
  (886, 7)	1.0
  (887, 0)	1.0
  (887, 4)	1.0
  (887, 6)	1.0
  (888, 0)	1.0
  (888, 4)	1.0
  (888, 8)	1.0
  (889, 1)	1.0
  (889, 2)	1.0
  (889, 6)	1.0
  (890, 1)	1.0
  (890, 3)	1.0
  (890, 8)	1.0


In [None]:
# Let's first train test the above code 
from sklearn.preprocessing import OneHotEncoder
df = sns.load_dataset('titanic')

df.sample(5)

# cat_col = df[['sex', 'embarked', 'class']]
X_train, X_test, y_train, y_test = train_test_split(cat_col.iloc[:,[0,1]],cat_col.iloc[:,-1], test_size=0.2, random_state=42)
X_train
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(cat_col)
print(encoded_data)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2673 stored elements and shape (891, 9)>
  Coords	Values
  (0, 1)	1.0
  (0, 4)	1.0
  (0, 8)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 6)	1.0
  (2, 0)	1.0
  (2, 4)	1.0
  (2, 8)	1.0
  (3, 0)	1.0
  (3, 4)	1.0
  (3, 6)	1.0
  (4, 1)	1.0
  (4, 4)	1.0
  (4, 8)	1.0
  (5, 1)	1.0
  (5, 3)	1.0
  (5, 8)	1.0
  (6, 1)	1.0
  (6, 4)	1.0
  (6, 6)	1.0
  (7, 1)	1.0
  (7, 4)	1.0
  (7, 8)	1.0
  (8, 0)	1.0
  :	:
  (882, 8)	1.0
  (883, 1)	1.0
  (883, 4)	1.0
  (883, 7)	1.0
  (884, 1)	1.0
  (884, 4)	1.0
  (884, 8)	1.0
  (885, 0)	1.0
  (885, 3)	1.0
  (885, 8)	1.0
  (886, 1)	1.0
  (886, 4)	1.0
  (886, 7)	1.0
  (887, 0)	1.0
  (887, 4)	1.0
  (887, 6)	1.0
  (888, 0)	1.0
  (888, 4)	1.0
  (888, 8)	1.0
  (889, 1)	1.0
  (889, 2)	1.0
  (889, 6)	1.0
  (890, 1)	1.0
  (890, 3)	1.0
  (890, 8)	1.0


#### In this encoding we carefully learn and work on multicoliniarity: where we remove one column because if we do not remove one column and we add all column after OHE the output will be 1 

## Column Transformer

In [None]:
dt = pd.read_csv("D:\\Machine learning\\DataSets\\covid_data_with_missing_values.csv")
dt.sample(5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
450,37.0,Female,99.84364,Low,Chicago,Yes
490,45.0,Male,99.187489,Low,New York,No
261,46.0,Female,104.807592,Low,Houston,Yes
319,58.0,Male,97.746354,Low,Phoenix,Yes
73,24.0,Male,104.37656,High,Chicago,No


In [None]:
# dt.isnull().sum()
# dt.info()
# dt["cough"].value_counts()
dt["city"].value_counts()

city
Houston        108
Chicago        105
New York        98
Phoenix         96
Los Angeles     93
Name: count, dtype: int64

In [None]:
# let's fill missing values 
# Condsider this as dummy it will not effect the orignal outcome if we replace it with mean values
dt[dt["age"].isnull()]

# dt["age"].fillna(dt["age"].mean(),inplace=True)
# dt[dt["age"].isnull()]

# dt["fever"].fillna(dt["fever"].mean(),inplace=True)
# dt[dt["fever"].isnull()]


dt["age"] = dt["age"].astype("int")
dt["fever"] = dt["fever"].astype("int")

# dt["age"].dtype
dt.info()

In [None]:
# Train test siplit
X_train, X_test, y_train, y_test = train_test_split(dt.drop(columns=["has_covid"]), dt["has_covid"], test_size=0.2, random_state=42)
y_train

249    Yes
433    Yes
19     Yes
322     No
332     No
      ... 
106     No
270     No
348     No
435    Yes
102     No
Name: has_covid, Length: 400, dtype: object

### Without column transformer we have to transform each and every column separatly then concatinate

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


transformer = ColumnTransformer(transformers=[
    ('tranf1', SimpleImputer(),["age","fever"]),
    ('tranf2',OrdinalEncoder(categories=[["Low","High"]]),["cough"]),
    ('tranf3',OneHotEncoder(sparse_output=False,drop="first"),["gender","city"])],
                                remainder="passthrough")   # transformer get the list of columns need to tranform and reminder get the column other then that you can either 'drop' them or contain remain the same we will use term 'passthrough'

In [None]:
# transformer.fit_transform(X_train)
# transformer.fit_transform(X_train).shape

transformer.fit_transform(X_test)
# transformer.fit_transform(X_test).shape

array([[ 27., 102.,   1.,   1.,   1.,   0.,   0.,   0.],
       [ 24., 104.,   1.,   1.,   0.,   0.,   0.,   0.],
       [ 37., 101.,   1.,   1.,   1.,   0.,   0.,   0.],
       [ 63., 104.,   0.,   1.,   0.,   1.,   0.,   0.],
       [ 45.,  99.,   0.,   0.,   1.,   0.,   0.,   0.],
       [ 26., 104.,   0.,   1.,   0.,   1.,   0.,   0.],
       [ 65., 101.,   0.,   0.,   0.,   1.,   0.,   0.],
       [ 53.,  97.,   0.,   1.,   0.,   0.,   0.,   0.],
       [ 69., 100.,   0.,   1.,   0.,   1.,   0.,   0.],
       [ 37.,  99.,   0.,   0.,   0.,   0.,   0.,   0.],
       [ 46.,  99.,   0.,   0.,   0.,   0.,   1.,   0.],
       [ 42., 104.,   0.,   0.,   0.,   0.,   0.,   0.],
       [ 28.,  99.,   1.,   0.,   0.,   0.,   0.,   0.],
       [ 32.,  97.,   1.,   0.,   1.,   0.,   0.,   0.],
       [ 68., 103.,   0.,   0.,   0.,   1.,   0.,   0.],
       [ 19.,  97.,   0.,   1.,   0.,   0.,   1.,   0.],
       [ 27.,  97.,   1.,   0.,   0.,   1.,   0.,   0.],
       [ 50., 102.,   0.,   1.,

## Revision

In [None]:
Data = ["Yes","No","Yes","Yes","No","Yes","No","No"]
dt = pd.DataFrame(Data,columns=["Buyer"])
Encoder = LabelEncoder()
L_encoder = Encoder.fit_transform(dt["Buyer"])
# print(L_encoder)
dt["Buyer"] = L_encoder
dt

Unnamed: 0,Buyer
0,1
1,0
2,1
3,1
4,0
5,1
6,0
7,0


In [None]:
# OrdinalEncoding
Data = [["High"],["High"],["Medium"],["Low"],["Low"],["Low"],["Low"],["Medium"],["High"]]
dt = pd.DataFrame(Data,columns=["Fever"])
# dt_columns = dt[['Fever']]
Encoder = OrdinalEncoder(categories=[["Low","Medium","High"]])
O_encoder = Encoder.fit_transform(dt)
O_encoder

array([[2.],
       [2.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [2.]])

In [None]:
# OneHotEncoding
dt = sns.load_dataset("titanic")
# dt.sample(5)

dt_columns = ["sex","embarked","class"]
pd_ohe_encoder = pd.get_dummies(dt, columns=dt_columns)
pd_ohe_encoder

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,who,adult_male,deck,embark_town,alive,alone,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third
0,0,3,22.0,1,0,7.2500,man,True,,Southampton,no,False,False,True,False,False,True,False,False,True
1,1,1,38.0,1,0,71.2833,woman,False,C,Cherbourg,yes,False,True,False,True,False,False,True,False,False
2,1,3,26.0,0,0,7.9250,woman,False,,Southampton,yes,True,True,False,False,False,True,False,False,True
3,1,1,35.0,1,0,53.1000,woman,False,C,Southampton,yes,False,True,False,False,False,True,True,False,False
4,0,3,35.0,0,0,8.0500,man,True,,Southampton,no,True,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,man,True,,Southampton,no,True,False,True,False,False,True,False,True,False
887,1,1,19.0,0,0,30.0000,woman,False,B,Southampton,yes,True,True,False,False,False,True,True,False,False
888,0,3,,1,2,23.4500,woman,False,,Southampton,no,False,True,False,False,False,True,False,False,True
889,1,1,26.0,0,0,30.0000,man,True,C,Cherbourg,yes,True,False,True,True,False,False,True,False,False


In [None]:
# dt_columns = [["sex","embarked","class"]]
# Encoder = OneHotEncoder()
# ohe_encoder = Encoder.fit_transform(dt_columns)
# print(ohe_encoder)

df = sns.load_dataset('titanic')
cat_col = df[['sex', 'embarked', 'class']]
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(cat_col)
print(encoded_data)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2673 stored elements and shape (891, 9)>
  Coords	Values
  (0, 1)	1.0
  (0, 4)	1.0
  (0, 8)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 6)	1.0
  (2, 0)	1.0
  (2, 4)	1.0
  (2, 8)	1.0
  (3, 0)	1.0
  (3, 4)	1.0
  (3, 6)	1.0
  (4, 1)	1.0
  (4, 4)	1.0
  (4, 8)	1.0
  (5, 1)	1.0
  (5, 3)	1.0
  (5, 8)	1.0
  (6, 1)	1.0
  (6, 4)	1.0
  (6, 6)	1.0
  (7, 1)	1.0
  (7, 4)	1.0
  (7, 8)	1.0
  (8, 0)	1.0
  :	:
  (882, 8)	1.0
  (883, 1)	1.0
  (883, 4)	1.0
  (883, 7)	1.0
  (884, 1)	1.0
  (884, 4)	1.0
  (884, 8)	1.0
  (885, 0)	1.0
  (885, 3)	1.0
  (885, 8)	1.0
  (886, 1)	1.0
  (886, 4)	1.0
  (886, 7)	1.0
  (887, 0)	1.0
  (887, 4)	1.0
  (887, 6)	1.0
  (888, 0)	1.0
  (888, 4)	1.0
  (888, 8)	1.0
  (889, 1)	1.0
  (889, 2)	1.0
  (889, 6)	1.0
  (890, 1)	1.0
  (890, 3)	1.0
  (890, 8)	1.0


In [None]:
# Column Transformer
dt = pd.read_csv("D:\\Machine learning\\DataSets\\covid_data_with_missing_values.csv")
# dt.sample(5)

# let's fill missing values 
# Condsider this as dummy it will not effect the orignal outcome if we replace it with mean values
dt[dt["age"].isnull()]

dt["age"].fillna(dt["age"].mean(),inplace=True)
dt[dt["age"].isnull()]

dt["fever"].fillna(dt["fever"].mean(),inplace=True)
dt[dt["fever"].isnull()]


dt["age"] = dt["age"].astype("int")
dt["fever"] = dt["fever"].astype("int")

# dt["age"].dtype
dt["age"].isnull().sum()
# dt.info()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dt["age"].fillna(dt["age"].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dt["fever"].fillna(dt["fever"].mean(),inplace=True)


np.int64(0)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dt.drop(columns=["has_covid"]), dt["has_covid"], test_size=0.2, random_state=42)
X_train.sample(5)

Unnamed: 0,age,gender,fever,cough,city
24,37,Male,97,Low,Houston
122,32,Male,102,Low,New York
402,45,Male,100,Low,New York
306,56,Male,99,Low,New York
74,61,Female,102,Low,New York


In [None]:
Encoder = OneHotEncoder()