In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the toy COVID dataset from a CSV file in the working directory.
df = pd.read_csv('covid_toy.csv')

In [5]:
# Preview the first five rows to see the columns and typical values.
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [7]:
# Column dtypes and non-null counts; reveals which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [11]:
# Step 1: hold out 20% of the rows as a test set (seeded so the split is repeatable).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),  # features: every column except the label
    df['has_covid'],                 # label
    test_size=0.2,
    random_state=0,
)

# Sanity check on the sizes of the two feature frames.
(X_train.shape, X_test.shape)

((80, 5), (20, 5))

In [11]:
# First rows of the training features; the label column is no longer present.
X_train.head()

Unnamed: 0,age,gender,fever,cough,city
43,22,Female,99.0,Mild,Bangalore
62,56,Female,104.0,Strong,Bangalore
3,31,Female,98.0,Mild,Kolkata
71,75,Female,104.0,Strong,Delhi
45,72,Male,99.0,Mild,Bangalore


In [13]:
# First rows of the held-out test features.
X_test.head()

Unnamed: 0,age,gender,fever,cough,city
26,19,Female,100.0,Mild,Kolkata
86,25,Male,104.0,Mild,Bangalore
2,42,Male,101.0,Mild,Delhi
55,81,Female,101.0,Mild,Mumbai
75,5,Male,102.0,Mild,Kolkata


In [25]:
# Step 2: fill the missing values in the `fever` column.
from sklearn.impute import SimpleImputer

# Learn the fill value from the training rows only, then apply that same
# fitted imputer to both splits so nothing is learned from the test rows.
si = SimpleImputer()
si.fit(X_train[['fever']])

X_train_fever = si.transform(X_train[['fever']])
X_test_fever = si.transform(X_test[['fever']])

In [61]:
# Number of training rows per city.
X_train[['city']].value_counts()

city     
Bangalore    25
Kolkata      23
Delhi        19
Mumbai       13
Name: count, dtype: int64

In [41]:
# Peek at rows 1 through 9 of the imputed fever array.
X_train_fever[1:10]

array([[104.],
       [ 98.],
       [104.],
       [ 99.],
       [ 99.],
       [101.],
       [ 98.],
       [ 98.],
       [100.]])

In [49]:
# Step 3: one-hot encode the nominal columns `gender` and `city`.
from sklearn.preprocessing import OneHotEncoder

# drop='first' omits one indicator column per feature;
# sparse_output=False returns a dense ndarray instead of a sparse matrix.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.fit(X_train[['gender', 'city']])

X_train_gender_city = ohe.transform(X_train[['gender', 'city']])
X_test_gender_city = ohe.transform(X_test[['gender', 'city']])

In [63]:
# First five encoded rows (indicator columns for gender and city).
X_train_gender_city[:5]

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])

In [65]:
# (rows, columns) of the raw training features.
X_train.shape

(80, 5)

In [67]:
# (rows, columns) of the raw test features.
X_test.shape

(20, 5)

In [83]:
# Step 4: ordinal-encode `cough`, whose two levels have a natural order.
from sklearn.preprocessing import OrdinalEncoder

# The explicit category list fixes the ranking: 'Mild' -> 0, 'Strong' -> 1.
oe = OrdinalEncoder(
    categories=[
        ['Mild', 'Strong'],
    ],
)

In [85]:
# Fit the ordinal encoder on the training split, then encode both splits with it.
oe.fit(X_train[['cough']])

X_train_cough = oe.transform(X_train[['cough']])
X_test_cough = oe.transform(X_test[['cough']])

In [134]:
# `age` needs no preprocessing; keep it as a 2-D (n_rows, 1) array so it can be
# concatenated with the other transformed pieces.
# Select the column by name rather than dropping every other column, so this
# stays correct if further columns are ever added to the frame.
X_train_age = X_train[['age']].values
X_test_age = X_test[['age']].values

In [136]:
# Confirm the age array is 2-D with a single column.
X_train_age.shape

(80, 1)

In [138]:
# Stitch the separately transformed pieces back together column-wise.
# Column order: age, fever, gender/city indicators, cough.
X_train_transformed = np.concatenate(
    [X_train_age, X_train_fever, X_train_gender_city, X_train_cough],
    axis=1,
)

X_test_transformed = np.concatenate(
    [X_test_age, X_test_fever, X_test_gender_city, X_test_cough],
    axis=1,
)

X_train_transformed.shape

(80, 7)

In [142]:
# Shape of the manually assembled training matrix.
X_train_transformed.shape

(80, 7)

In [156]:
# ColumnTransformer: the same preprocessing as the manual steps above,
# declared once as (name, transformer, columns) triples.
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    remainder='passthrough',  # columns not listed below are kept unchanged
    transformers=[
        ('tnf1', SimpleImputer(), ['fever']),
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
)

In [160]:
# Fit every sub-transformer on the training split and check the resulting shape.
transformer.fit_transform(X_train).shape

(80, 7)

In [162]:
# Apply the already-fitted transformer to the test split (no re-fitting) and check the shape.
transformer.transform(X_test).shape

(20, 7)

# Practice

In [69]:
# Load the customer dataset for the practice exercise.
# NOTE: this rebinds `df`, which previously held the covid data; re-running the
# earlier covid cells after this one will operate on the customer frame instead.
df = pd.read_csv('customer.csv')

In [71]:
# Preview the first five rows of the customer data.
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [73]:
# Step 1: hold out 20% of the rows as a test set (seeded so the split is repeatable).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['purchased']),  # features: every column except the label
    df['purchased'],                 # label
    test_size=0.2,
    random_state=0,
)

# Sanity check on the sizes of the two feature frames.
(X_train.shape, X_test.shape)

((40, 4), (10, 4))

In [75]:
# 2. Extracting 'age' column
# `age` needs no encoding; keep it as a 2-D (n_rows, 1) array for later concatenation.
# Select it by name rather than dropping every other column, so this stays
# correct if further columns are ever added to the frame.

X_train_age = X_train[['age']].values

X_test_age = X_test[['age']].values

In [17]:
# Applying OneHotEncoder to the nominal `gender` column.
from sklearn.preprocessing import OneHotEncoder

# drop='first' omits one indicator column; sparse_output=False returns a dense ndarray.
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe.fit(X_train[['gender']])

X_train_gender = ohe.transform(X_train[['gender']])
X_test_gender = ohe.transform(X_test[['gender']])

In [19]:
# Shape of the encoded gender array.
X_train_gender.shape

(40, 1)

In [21]:
# Applying OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder

# Each inner list must run from lowest to highest, because a category's position
# in the list becomes its encoded value. 'Average' therefore sits between 'Poor'
# and 'Good' (the previous order ranked 'Good' below 'Average').
# First list -> `review`, second list -> `education`.
oe = OrdinalEncoder(categories=[['Poor','Average','Good'],['School','UG','PG']])

In [23]:
# Fit the ordinal encoder on the training split, then encode both splits with it.
oe.fit(X_train[['review', 'education']])

X_train_review_education = oe.transform(X_train[['review', 'education']])
X_test_review_education = oe.transform(X_test[['review', 'education']])

In [25]:
# One encoded column each for review and education.
X_train_review_education.shape

(40, 2)

In [27]:
# Combine the pieces column-wise. Column order: age, gender, review, education.
X_train_transformed = np.concatenate(
    [X_train_age, X_train_gender, X_train_review_education],
    axis=1,
)

X_test_transformed = np.concatenate(
    [X_test_age, X_test_gender, X_test_review_education],
    axis=1,
)

In [29]:
# Shape of the manually assembled training matrix.
X_train_transformed.shape

(40, 4)

In [97]:
# ColumnTransformer: the manual encoding steps above, declared as
# (name, transformer, columns) triples.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

transformer = ColumnTransformer(
    remainder='passthrough',  # columns not listed below are kept unchanged
    transformers=[
        ('tnf1', OneHotEncoder(sparse_output=False, drop='first'), ['gender']),
        (
            'tnf2',
            # Category lists run from lowest to highest rank.
            OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']]),
            ['review', 'education'],
        ),
    ],
)

In [99]:
# Raw training features before the transformer is applied.
X_train.head()

Unnamed: 0,age,gender,review,education
33,89,Female,Good,PG
35,74,Male,Poor,School
26,53,Female,Poor,PG
34,86,Male,Average,School
18,19,Male,Good,School


In [101]:
# Fit the encoders on the training split and encode it in one call.
X_train_transformed = transformer.fit_transform(X_train)

In [103]:
# Encode the test split with the encoders fitted on the training split (no re-fitting).
X_test_transformed = transformer.transform(X_test)

In [109]:
# Shape of the training matrix produced by the transformer.
X_train_transformed.shape

(40, 4)

# Practice 2

In [154]:
# Load a 200-row slice of the bank churners data, keeping only the columns
# used in this exercise and indexing the frame by client number.
df = pd.read_csv(
    'Bankchurners.csv',
    usecols=[
        'CLIENTNUM',
        'Attrition_Flag',
        'Customer_Age',
        'Gender',
        'Education_Level',
        'Marital_Status',
        'Card_Category',
    ],
    index_col='CLIENTNUM',
    nrows=200,
)

In [156]:
# Three randomly chosen rows; no seed is set, so the rows differ between runs.
df.sample(3)

Unnamed: 0_level_0,Attrition_Flag,Customer_Age,Gender,Education_Level,Marital_Status,Card_Category
CLIENTNUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
720336708,Existing Customer,53,M,Doctorate,Married,Blue
802013583,Existing Customer,56,M,College,Married,Blue
783554958,Existing Customer,58,M,Graduate,Single,Blue


In [158]:
# (rows, columns) of the loaded slice; the index column is not counted.
df.shape

(200, 6)

In [165]:
# Frequency of each education level; used to list the categories for the ordinal encoder.
df['Education_Level'].value_counts()

Education_Level
Graduate         67
High School      40
Uneducated       31
Unknown          26
College          17
Doctorate        11
Post-Graduate     8
Name: count, dtype: int64

In [163]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [184]:
transformer = ColumnTransformer(
    remainder='passthrough',  # columns not listed below are kept unchanged
    transformers=[
        # Education levels listed from lowest to highest rank;
        # 'Unknown' is placed below 'Uneducated' and so encodes as 0.
        (
            'tnf1',
            OrdinalEncoder(categories=[[
                'Unknown', 'Uneducated', 'High School', 'College',
                'Graduate', 'Post-Graduate', 'Doctorate',
            ]]),
            ['Education_Level'],
        ),
        # NOTE(review): Attrition_Flag looks like the label rather than a feature;
        # confirm it is meant to be encoded together with the inputs.
        (
            'tnf2',
            OneHotEncoder(sparse_output=False, drop='first'),
            ['Attrition_Flag', 'Gender', 'Marital_Status', 'Card_Category'],
        ),
    ],
)

In [194]:
# Fit on the whole frame and display the encoded array: the ordinal education
# column comes first and the passthrough column (Customer_Age) comes last.
transformer.fit_transform(df)

array([[ 2.,  1.,  1., ...,  0.,  0., 45.],
       [ 4.,  1.,  0., ...,  0.,  0., 49.],
       [ 4.,  1.,  1., ...,  0.,  0., 51.],
       ...,
       [ 1.,  1.,  1., ...,  0.,  0., 53.],
       [ 2.,  0.,  1., ...,  0.,  0., 68.],
       [ 3.,  1.,  1., ...,  0.,  0., 59.]])