In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (10, 6)

In [None]:
df = pd.read_csv('/kaggle/input/eda-project-analyze-us-citizens/adult_eda.csv')

## 1.Try to understand what the data looks like
#### ** Check the head, shape, data-types of the features.
#### ** Check whether there are any duplicate rows. If there are, drop them.
#### ** Check the statistical values of features.
#### ** If needed, rename the columns' names for easy use.
#### ** Basically check the missing values.

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Replace hyphens in the column names with underscores so that hyphenated names
# become usable with attribute access (e.g. df.marital_status, as later cells do).

df.columns = df.columns.str.replace('-','_')
df.columns

In [None]:
df.describe().T

In [None]:
df.duplicated().value_counts()

In [None]:
df = df.drop_duplicates()

In [None]:
df.isnull().sum()

### 2.Look at the value counts of columns that have object datatype and detect strange values apart from the NaN Values

In [None]:
df.describe(include = object)

In [None]:
# Names of all columns whose dtype is the generic `object` dtype.
obj_col = df.columns[(df.dtypes == object).values]
obj_col

In [None]:
df.education.value_counts()

In [None]:
for col in obj_col:
    print(col)
    print(df[col].value_counts())
    print('------------------------------------------')

In [None]:
df.salary.value_counts()

In [None]:
print((df.astype(str)=='?').sum())

## any() - gives bool value
## sum() - gives no.of values present

## Examine all features (first the target feature "salary", then the numeric ones, and lastly the categorical ones) separately, from different aspects, with respect to the target feature.

### to do list for numeric features:

##### ** Check the boxplot to see extreme values
##### ** Check the histplot/kdeplot to see distribution of feature
##### ** Check the statistical values
##### ** Check the boxplot and histplot/kdeplot by "salary" levels
##### ** Check the statistical values by "salary" levels
##### ** Write down the conclusions you draw from your analysis

### to do list for categoric features:

##### ** Find the features which contains similar values, examine the similarities and analyze them together
##### ** Check the count/percentage of person in each categories and visualize it with a suitable plot
##### ** If needed, decrease the number of categories by combining similar categories
##### ** Check the count of person in each "salary" levels by categories and visualize it with a suitable plot
##### ** Check the percentage distribution of person in each "salary" levels by categories and visualize it with suitable plot
##### ** Check the count of person in each categories by "salary" levels and visualize it with a suitable plot
##### ** Check the percentage distribution of person in each categories by "salary" levels and visualize it with suitable plot
##### ** Write down the conclusions you draw from your analysis

## Salary

In [None]:
df.salary.value_counts()

In [None]:
# Number of rows per salary class, with every bar labelled by its count.
plt.figure(figsize=(8,5))
sal_plot = sns.countplot(x='salary',data=df)
for sal in sal_plot.patches:
    bar_height = sal.get_height()
    # Format the height without decimals (it may come back as a float, which would
    # otherwise print as e.g. "24720.0") and centre the text on the bar instead of
    # left-anchoring it at a fixed +0.4 offset.
    sal_plot.annotate(
        format(bar_height, '.0f'),
        (sal.get_x() + sal.get_width() / 2, bar_height + 1),
        ha='center',
        va='bottom',
        fontsize=10,
    );

In [None]:
per_sal = df['salary'].value_counts()/len(df)
per_sal

In [None]:
# Pie chart of the share of each salary class.
# The slice labels are read from per_sal's own index, so each slice is always named
# after the class it represents, whatever order value_counts() returned them in.
plt.figure(figsize=(10,5))

plt.pie(per_sal,
       labels=list(per_sal.index),
       explode = [0,0.1],
       shadow = True,
       autopct = '%0.1f%%',
       colors = ['lightskyblue','gold'])
plt.title('Percentage of Income_Levels',fontsize=14);

#### Result : 75.9% of the people earn at most 50K (<=50K), and the remaining 24.1% earn more than 50K.

### comparing all the other numerical features with the salary and finding the final conclusion / result

## Numeric Data

#### For a numeric (continuous) column we use histogram, KDE, or box plots.

## Age

In [None]:
# kde can also be included in the histplot

plt.figure(figsize=(8,5))
sns.histplot(x='age', data = df, kde = True , bins = 25)

In [None]:
# boxplot also indicates the 3 quartiles (i.e q1 ,q2 ,q3)

plt.figure(figsize=(6,5))
sns.boxplot(x='age',data = df)

In [None]:
# comparing both the age and the salary data

plt.figure(figsize=(8,5))
sns.histplot(x='age',hue='salary',kde=True,data=df,bins=25)

In [None]:
plt.figure(figsize=(9,5))
sns.boxplot(x='salary',y='age',data=df)

In [None]:
df.groupby('salary')['age'].describe()

#### Result : The mean and median age of the high-income group are higher than those of the low-income group, i.e. older people are more likely to be in the high-income group.

## fnlwgt

#### check the box plot to see the extreme values

In [None]:
plt.figure(figsize=(8,5))
sns.kdeplot(x='fnlwgt',data=df,fill=True)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='fnlwgt',data=df)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='salary',y='fnlwgt',data=df)

In [None]:
plt.figure(figsize=(8,5))
sns.histplot(x='fnlwgt',hue='salary',data=df,kde=True,bins=25)

In [None]:
df.groupby('salary')['fnlwgt'].describe()

#### Result : There is no significant difference between high and low-income groups according to the "fnlwgt" feature.

## Capital_gain

In [None]:
plt.figure(figsize=(8,5))
sns.kdeplot(x='capital_gain',data=df,fill=True)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='capital_gain',data=df)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='salary',y='capital_gain',data=df)

In [None]:
plt.figure(figsize=(8,5))
sns.histplot(x='capital_gain',hue='salary',data=df,kde=True,bins=15)

In [None]:
df.groupby('salary')['capital_gain'].describe()

#### Check the statistical values by "salary" levels for capital_gain not equal the zero

In [None]:
df.loc[df['capital_gain'] != 0].groupby('salary')['capital_gain'].describe()

#### Result : The "capital-gain" feature has not provided very meaningful insights. Nevertheless, we can say that the more "capital-gain", the more "high-income".

## Capital_loss

In [None]:
plt.figure(figsize=(8,5))
sns.kdeplot(x='capital_loss',data=df,fill=True)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='capital_loss',data=df)

In [None]:
plt.figure(figsize=(8,5))
sns.histplot(x='capital_loss',hue='salary',data=df,kde=True,bins=20)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='salary',y='capital_loss',data=df)

In [None]:
df.groupby('salary')['capital_loss'].describe()

In [None]:
df.loc[df['capital_loss']!=0].groupby('salary')['capital_loss'].describe()

#### Result : there is no significant difference between high and low_income groups according to the capital_loss

## Hours_per_week


In [None]:
plt.figure(figsize=(8,5))
sns.kdeplot(x='hours_per_week',data=df,fill=True)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='hours_per_week',data=df)

In [None]:
plt.figure(figsize=(8,5))
sns.histplot(x='hours_per_week',hue='salary',data=df,kde=True,bins=20)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='salary',y='hours_per_week',data=df)

In [None]:
df.groupby('salary')['hours_per_week'].describe()

#### Result : Most people in the high-income group work at least 40 hours per week. Most of those who work less than 40 hours per week are in the low-income group.

## education_num

In [None]:
plt.figure(figsize=(8,5))
sns.kdeplot(x='education_num',data=df,fill=True)

In [None]:
plt.subplots(figsize=(8,5))
sns.boxplot(x='education_num',data=df)

In [None]:
plt.subplots(figsize=(8,5))
sns.kdeplot(x='education_num',data=df,hue='salary',fill=True)

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='salary',data=df,y='education_num')

In [None]:
df.groupby('education_num')['salary'].value_counts()

## Categorical Features

## Education

In [None]:
df.education.unique()

In [None]:
df.education.value_counts(dropna=False)

In [None]:
# Number of people in each education category.
fig,ax = plt.subplots(figsize=(8,5))
sns.countplot(x='education',data=df,color='skyblue',ax=ax)
# Rotate the tick labels seaborn already placed instead of overwriting them with
# df.education.unique(): the overwrite only lines up while unique() lists the
# categories in exactly the same order and number as the bars.
ax.tick_params(axis='x', labelrotation=90);

### Check the count of person in each "salary" levels by the feature education and visualize them with countplot

In [None]:
# Number of people in each education category, split by salary class.
fig,ax = plt.subplots(figsize=(8,5))
sns.countplot(x='education',data=df,hue='salary',ax=ax)
# Rotate the tick labels seaborn already placed instead of overwriting them with
# df.education.unique(): the overwrite only lines up while unique() lists the
# categories in exactly the same order and number as the bars.
ax.tick_params(axis='x', labelrotation=90);

In [None]:
df.groupby('education')['salary'].value_counts(dropna=False)

## To plot a piechart

### Decrease the number of categories in "education" feature as low, medium, and high level and create a new feature with this new categorical data.

In [None]:
def mapping_education(x):
    """Collapse a raw ``education`` value into one of three coarse bands.

    Parameters
    ----------
    x : object
        A single value taken from the ``education`` column.

    Returns
    -------
    str or None
        ``"low_level_grade"``, ``"medium_level_grades"`` or ``"high_level_grade"``;
        ``None`` when ``x`` is not one of the listed categories.
    """
    bands = (
        (("Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "11th", "12th"),
         "low_level_grade"),
        (("HS-grad", "Some-college", "Assoc-voc", "Assoc-acdm"),
         "medium_level_grades"),
        (("Bachelors", "Masters", "Prof-school", "Doctorate"),
         "high_level_grade"),
    )
    for members, band in bands:
        if x in members:
            return band
    # Unlisted values fall through without a band.
    return None

In [None]:
df.education.apply(mapping_education).value_counts()

In [None]:
df['education_summary']=df.education.apply(mapping_education)
df.head(2)

In [None]:
plt.subplots(figsize=(8,5))
edu_lev = sns.countplot(x='education_summary',data=df)

In [None]:
plt.subplots(figsize=(8,5))
sns.countplot(x='education_summary',data=df,hue='salary')

In [None]:
df.salary.value_counts()

In [None]:
df.groupby('education_summary')['salary'].value_counts()

In [None]:
(df.groupby('education_summary')['salary']).count()

In [None]:
# Share of each salary class inside every education_summary group:
# per-(group, class) counts divided by the group's number of non-null salary values.
# Note: despite the variable name, these are proportions, not percentiles.
percentile = (df.groupby('education_summary')['salary']).value_counts()/ (df.groupby('education_summary')['salary']).count()
percentile

In [None]:
# Salary shares of the high-education group, selected by label rather than by
# position so the result does not depend on group order or on every group having
# exactly two salary classes. drop_level=False keeps the (group, salary) index.
high = percentile.xs('high_level_grade', level=0, drop_level=False)
high


In [None]:
# Salary shares of the low-education group, selected by label rather than by
# position so the result does not depend on group order or on every group having
# exactly two salary classes. drop_level=False keeps the (group, salary) index.
low = percentile.xs('low_level_grade', level=0, drop_level=False)
low

In [None]:
# Salary shares of the medium-education group, selected by label rather than by
# position so the result does not depend on group order or on every group having
# exactly two salary classes. drop_level=False keeps the (group, salary) index.
medium = percentile.xs('medium_level_grades', level=0, drop_level=False)
medium

In [None]:
# One pie per education level showing the share of each salary class.
# Slice labels are read from each series' own index (last level = salary class)
# instead of a hard-coded list: value_counts() orders classes by frequency, so a
# fixed ['<=50K', '>50K'] list would mislabel any group where '>50K' is the majority.
fig,(ax1,ax2,ax3) = plt.subplots(ncols=3,figsize=(12,6))
edu_panels = (
    (ax1, high, 'high_level_edu'),
    (ax2, low, 'low_level_edu'),
    (ax3, medium, 'medium_level_edu'),
)
for panel, shares, panel_title in edu_panels:
    panel.pie(shares,
              labels=list(shares.index.get_level_values(-1)),
              autopct='%0.1f%%',
              colors=['lightblue','yellow'])
    panel.set_title(panel_title,fontsize=10)
    panel.set_ylabel('salary');

## Marital Status

In [None]:
df.marital_status.unique()

In [None]:
df.marital_status.value_counts()

In [None]:
# Number of people in each raw marital_status category.
fig,ax = plt.subplots(figsize=(8,5))
sns.countplot(x='marital_status',data=df,ax=ax)
# Rotate the tick labels seaborn already placed instead of overwriting them with
# df.marital_status.unique(): the overwrite only lines up while unique() lists the
# categories in exactly the same order and number as the bars.
ax.tick_params(axis='x', labelrotation=90);

In [None]:
# Number of people in each raw marital_status category, split by salary class.
fig,ax = plt.subplots(figsize=(8,5))
sns.countplot(x='marital_status',data=df,hue='salary',ax=ax)
# Rotate the tick labels seaborn already placed instead of overwriting them with
# df.marital_status.unique(): the overwrite only lines up while unique() lists the
# categories in exactly the same order and number as the bars.
ax.tick_params(axis='x', labelrotation=90);

In [None]:
def marital_arranging(x):
    """Collapse a raw ``marital_status`` value into one of three groups.

    Parameters
    ----------
    x : object
        A single value taken from the ``marital_status`` column.

    Returns
    -------
    str or None
        ``'Married'``, ``'Divorced'`` (also used for separated and widowed) or
        ``'Never Married'``; ``None`` when ``x`` is not one of the listed statuses.
    """
    groups = (
        (('Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'), 'Married'),
        (('Divorced', 'Separated', 'Widowed'), 'Divorced'),
        (('Never-married',), 'Never Married'),
    )
    for statuses, group in groups:
        if x in statuses:
            return group
    # Unlisted values fall through without a group.
    return None

In [None]:
df['Marital_Status']=df.marital_status.apply(marital_arranging)

In [None]:
# Number of people in each grouped Marital_Status category.
fig,ax = plt.subplots(figsize=(8,5))
sns.countplot(x='Marital_Status',data=df,ax=ax)
# Rotate the tick labels seaborn already placed instead of overwriting them with
# df.Marital_Status.unique(): the overwrite only lines up while unique() lists the
# categories in exactly the same order and number as the bars.
ax.tick_params(axis='x', labelrotation=90);

In [None]:
# Number of people in each grouped Marital_Status category, split by salary class.
fig,ax = plt.subplots(figsize=(8,5))
sns.countplot(x='Marital_Status',data=df,hue='salary',ax=ax)
# Rotate the tick labels seaborn already placed instead of overwriting them with
# df.Marital_Status.unique(): the overwrite only lines up while unique() lists the
# categories in exactly the same order and number as the bars.
ax.tick_params(axis='x', labelrotation=90);

In [None]:
df.Marital_Status.value_counts()

In [None]:
df.groupby('Marital_Status')['salary'].value_counts()

In [None]:
marital_per=df.groupby('Marital_Status')['salary'].value_counts()/df.groupby('Marital_Status')['salary'].count()
marital_per

In [None]:
# Salary shares of the 'Divorced' group, selected by label rather than by position
# so the result does not depend on group order or on every group having exactly
# two salary classes. drop_level=False keeps the (group, salary) index.
Divorced = marital_per.xs('Divorced', level=0, drop_level=False)
Divorced

In [None]:
# Salary shares of the 'Married' group, selected by label rather than by position
# so the result does not depend on group order or on every group having exactly
# two salary classes. drop_level=False keeps the (group, salary) index.
Married = marital_per.xs('Married', level=0, drop_level=False)
Married

In [None]:
# Salary shares of the 'Never Married' group, selected by label rather than by
# position so the result does not depend on group order or on every group having
# exactly two salary classes. drop_level=False keeps the (group, salary) index.
Never_Married = marital_per.xs('Never Married', level=0, drop_level=False)
Never_Married

In [None]:
# One pie per marital group showing the share of each salary class.
# Slice labels are read from each series' own index (last level = salary class)
# instead of a hard-coded list: value_counts() orders classes by frequency, so a
# fixed ['<=50K', '>50K'] list would mislabel any group where '>50K' is the majority.
fig,(plot1,plot2,plot3)=plt.subplots(ncols=3,figsize=(10,6));
marital_panels = (
    (plot1, Married, 'Married'),
    (plot2, Divorced, 'Divorced'),
    (plot3, Never_Married, 'Never_Married'),
)
for panel, shares, panel_title in marital_panels:
    panel.pie(shares,
              autopct='%.1f%%',
              labels=list(shares.index.get_level_values(-1)),
              explode=[0,0.1],
              shadow=True,
              colors=(['yellow','red']))
    panel.set_title(panel_title)
    panel.set_ylabel('salary');

## Sex

In [None]:
df.sex.unique()

In [None]:
df.sex.value_counts()

In [None]:
fig,ax = plt.subplots(figsize=(8,5))
sns.countplot(x='sex',data=df)

In [None]:
fig,ax = plt.subplots(figsize=(8,5))
sns.countplot(x='sex',data=df,hue='salary')

In [None]:
df.groupby('sex')['salary'].value_counts()

In [None]:
sex_per = df.groupby('sex')['salary'].value_counts()/df.groupby('sex')['salary'].count()
sex_per

In [None]:
Female = sex_per[:2]
print(Female)
Male = sex_per[2:]
print(Male)

In [None]:
# One pie per sex showing the share of each salary class.
# Slice labels are read from each series' own index (last level = salary class)
# instead of a hard-coded list: value_counts() orders classes by frequency, so a
# fixed ['<=50K', '>50K'] list would mislabel a group where '>50K' is the majority.
fig , (plot1,plot2) = plt.subplots(ncols=2,figsize=(8,6))
plot1.pie(Female,labels=list(Female.index.get_level_values(-1)),autopct='%.1f%%',shadow=True,explode=[0,0.1])
plot1.set_title('Female',fontsize=10)
plot1.set_ylabel('salary')
plot2.pie(Male,labels=list(Male.index.get_level_values(-1)),autopct='%.1f%%',shadow=True,explode=[0,0.1])
plot2.set_title('Male')
plot2.set_ylabel('salary');

#### Result : A larger share of men than of women earn more than 50K.

In [None]:
df.head(2)

## Data Cleaning

### 1. Handling Missing values

In [None]:
df.isnull().sum()

In [None]:
# Drop, in place, every COLUMN (axis=1) that contains at least one NaN.
# NOTE(review): this removes whole features rather than the affected rows — confirm
# that is intended; dropping the incomplete rows instead would be axis=0.
df.dropna(axis=1,inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.workclass.value_counts()

In [None]:
df.occupation.value_counts()

In [None]:
df.replace(to_replace='?',value=np.nan,inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)
df.isnull().sum()

## Handling Outliers

In [None]:
plt.figure(figsize=(8,6))
df.boxplot()

In [None]:
# One boxplot per numeric feature on a 2x3 grid.
fig,ax = plt.subplots(nrows=2,ncols=3,figsize=(12,6))

numeric_cols = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']
panels = ax.ravel()
# Fill the grid in reading order so no panel is skipped in the middle of a row
# (the previous layout jumped from ax[1, 0] straight to ax[1, 2]).
for panel, col in zip(panels, numeric_cols):
    sns.boxplot(ax=panel, data=df, x=col)
# Five features on six panels: hide the unused one instead of drawing an empty frame.
for panel in panels[len(numeric_cols):]:
    panel.set_visible(False)
fig.tight_layout();


In [None]:
# Pairwise correlation of the numeric features only. The frame still holds text
# columns, and DataFrame.corr() raises on those under pandas >= 2.0 (older versions
# silently ignored them), so restrict to numeric dtypes explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between the numeric features.
# Text columns are excluded up front because DataFrame.corr() raises on them
# under pandas >= 2.0.
plt.figure(figsize=(12,6))
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True);