In [133]:
# import required libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [134]:
# Load the training data into a DataFrame and preview its first rows.
# NOTE(review): the file is named 'Train.xls' but is parsed with the CSV
# reader; presumably it is comma-separated text despite the extension — confirm.
dataset = pd.read_csv('Train.xls')
dataset.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [135]:
# Dimensions of the loaded DataFrame as (rows, columns)
dataset.shape

(8068, 11)

In [136]:
# Count the missing (null) entries in each column
dataset.isnull().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [137]:
# Print a per-column summary: non-null counts and dtypes, plus memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [138]:
# Frequency of each value in Ever_Married (value_counts skips nulls by default)
dataset['Ever_Married'].value_counts()

Ever_Married
Yes    4643
No     3285
Name: count, dtype: int64

In [139]:
# Impute missing Ever_Married values with the column's most frequent value.
# The mode is computed from the data rather than hard-coded, so the fill stays
# correct if the input file changes. mode() ignores nulls; when several values
# tie it returns them sorted, and the first one is used. 'Yes' (the value the
# notebook originally hard-coded) is kept as the fallback for an all-null column.
ever_married_modes = dataset['Ever_Married'].mode()
ever_married_fill = ever_married_modes.iloc[0] if not ever_married_modes.empty else 'Yes'
dataset['Ever_Married'] = dataset['Ever_Married'].fillna(ever_married_fill)
dataset['Ever_Married']

0        No
1       Yes
2       Yes
3       Yes
4       Yes
       ... 
8063     No
8064     No
8065     No
8066     No
8067    Yes
Name: Ever_Married, Length: 8068, dtype: object

In [140]:
# Re-check the Ever_Married frequencies after the missing values were filled
dataset.Ever_Married.value_counts()

Ever_Married
Yes    4783
No     3285
Name: count, dtype: int64

In [141]:
# Re-inspect the per-column non-null counts after filling Ever_Married
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     8068 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [142]:
# Frequency of each value in the Graduated column (value_counts skips nulls by default)
dataset['Graduated'].value_counts()

Graduated
Yes    4968
No     3022
Name: count, dtype: int64

In [143]:
# Impute missing Graduated values with the column's most frequent value.
# The mode is computed from the data rather than hard-coded, so the fill stays
# correct if the input file changes. mode() ignores nulls; when several values
# tie it returns them sorted, and the first one is used. 'Yes' (the value the
# notebook originally hard-coded) is kept as the fallback for an all-null column.
# Bracket assignment is used instead of attribute assignment so the write
# always targets the DataFrame column.
graduated_modes = dataset['Graduated'].mode()
graduated_fill = graduated_modes.iloc[0] if not graduated_modes.empty else 'Yes'
dataset['Graduated'] = dataset['Graduated'].fillna(graduated_fill)
dataset['Graduated']

0        No
1       Yes
2       Yes
3       Yes
4       Yes
       ... 
8063     No
8064     No
8065    Yes
8066    Yes
8067    Yes
Name: Graduated, Length: 8068, dtype: object

In [144]:
# Frequency of each value in the Profession column (value_counts skips nulls by default)
dataset.Profession.value_counts()

Profession
Artist           2516
Healthcare       1332
Entertainment     949
Engineer          699
Doctor            688
Lawyer            623
Executive         599
Marketing         292
Homemaker         246
Name: count, dtype: int64

In [145]:
# Number of missing (null) entries in the Profession column
dataset.Profession.isnull().sum()

124

In [146]:
# Impute missing Profession values with the two most frequent professions,
# split deterministically by row-index parity:
#   even index -> 'Artist', odd index -> 'Healthcare'
art = 'Artist'
health = 'Healthcare'

# Both masks are computed once up front; they are disjoint, so the order of the
# two assignments below does not matter.
profession_missing = dataset['Profession'].isnull()
on_even_row = dataset.index % 2 == 0

dataset.loc[profession_missing & on_even_row, 'Profession'] = art
dataset.loc[profession_missing & ~on_even_row, 'Profession'] = health

dataset.Profession

0          Healthcare
1            Engineer
2            Engineer
3              Lawyer
4       Entertainment
            ...      
8063       Healthcare
8064        Executive
8065       Healthcare
8066       Healthcare
8067        Executive
Name: Profession, Length: 8068, dtype: object

In [147]:
# Confirm that no missing Profession values remain after the fill
dataset.Profession.isnull().sum()

0

In [148]:
# Re-check the Profession frequencies after the missing values were filled
dataset.Profession.value_counts()

Profession
Artist           2568
Healthcare       1404
Entertainment     949
Engineer          699
Doctor            688
Lawyer            623
Executive         599
Marketing         292
Homemaker         246
Name: count, dtype: int64

In [149]:
# Remaining missing-value counts per column after the fills so far
dataset.isnull().sum()

ID                   0
Gender               0
Ever_Married         0
Age                  0
Graduated            0
Profession           0
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [150]:
# Frequency of each value in Work_Experience (value_counts skips nulls by default);
# this cell only inspects the column — no values are filled here
dataset.Work_Experience.value_counts()

Work_Experience
1.0     2354
0.0     2318
9.0      474
8.0      463
2.0      286
3.0      255
4.0      253
6.0      204
7.0      196
5.0      194
10.0      53
11.0      50
12.0      48
13.0      46
14.0      45
Name: count, dtype: int64

In [151]:
# Impute missing Work_Experience deterministically from the row index.
# Each missing row takes the value of the FIRST rule it matches:
#   index % 7 == 0 -> 1.0
#   index % 5 == 0 -> 0.0
#   index % 3 == 0 -> 9.0
#   index % 2 == 0 -> 8.0
#   otherwise      -> 2.0
# The median and (rounded) mean are computed from the observed values before
# the fill, but neither is used as a fill value in this cell.
median = dataset['Work_Experience'].median()
mean = round(dataset.Work_Experience.mean(), 1)

row_ids = dataset.index
work_exp_missing = dataset['Work_Experience'].isnull()

# np.select picks the first matching condition per row, which reproduces the
# priority order of the rules listed above.
replacement = np.select(
    [row_ids % 7 == 0, row_ids % 5 == 0, row_ids % 3 == 0, row_ids % 2 == 0],
    [1.0, 0.0, 9.0, 8.0],
    default=2.0,
)
dataset.loc[work_exp_missing, 'Work_Experience'] = replacement[work_exp_missing.to_numpy()]

# Number of Work_Experience values still missing after the fill
dataset.Work_Experience.isnull().sum()

0