## DATA EXPLORATION WITH PYTHON

An exploratory analysis of the adult income dataset, examining the variables that may affect a person's income.

In [1]:
# import package

import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# load the dataset
df = pd.read_csv('adult_income.csv')

df.shape

**TODO:** describe the shape of the dataset (number of rows and columns) here.

In [3]:
# explore the first 5 rows
df.head()

In [4]:
df[(df['income'] == '>50K') & (df['hours.per.week'] > 40)]

### Renaming columns

In [5]:
# Rename columns to snake_case identifiers: dotted names such as
# 'hours.per.week' lose their dots, while 'fnlwgt' and 'workclass' get more
# readable names. Every later cell refers to the columns by these new names.
df = df.rename(columns={
    'fnlwgt': 'final_weight',
    'workclass': 'work_class',
    'education.num': 'education_num',
    'marital.status': 'marital_status',
    'capital.gain': 'capital_gain',
    'capital.loss': 'capital_loss',
    'hours.per.week': 'hours_per_week',
    'native.country': 'native_country',
})

In [6]:
# explore the dataset columns
columns = df.columns

columns

In [7]:
df['sex'].unique()

In [8]:
df['sex'] = (df['sex'] == 'Male').astype(int)

df['sex'].unique()

In [9]:
df['race'].unique()

In [10]:
# Encode the race labels as integer codes.
# NOTE: Series.map yields NaN for any value that is not a key of the dict, so
# running this cell a second time (when the column already holds the integer
# codes) would turn the whole column into NaN.
df['race'] = df['race'].map({
    'White': 0,
    'Black': 1,
    'Asian-Pac-Islander': 2,
    'Other': 3,
    'Amer-Indian-Eskimo': 4
})

df

### Exploring and Analyzing 'income' column

In [11]:
#descriptive statistics summary
df['income'].describe()

In [12]:
df['income'].value_counts()

In [13]:
df['income'].head()

In [14]:
df['income'] = (df['income'] == '>50K').astype(int)

df['income'].head()

In [15]:
df['income'].value_counts()

In [16]:
# histogram with a KDE overlay of the (0/1 encoded) target column.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is the
# documented replacement that draws the same histogram + density curve.
sns.histplot(df['income'], kde=True, stat='density');

In [17]:
#skewness and kurtosis
print("Skewness: %f" % df['income'].skew())
print("Kurtosis: %f" % df['income'].kurt())

### Data Cleaning
In our dataset, missing values are represented by the string "?".
The columns "work_class", "occupation", etc. are affected.

In [18]:
def get_columns_with_null(frame: pd.DataFrame):
    """Return the labels of the columns in ``frame`` that hold at least one missing value."""
    # One boolean per column: True when any row of that column is NaN/None.
    has_missing = frame.isna().any()
    return frame.columns[has_missing]

In [19]:
# '?' is the placeholder this dataset uses for a missing value; turn it into a
# real NaN so pandas' missing-data tooling can see it.
# `np.nan` is the canonical spelling (the `np.NAN` alias no longer exists in
# NumPy 2.0), and re-assigning instead of `inplace=True` keeps the cell explicit.
df = df.replace('?', np.nan)

columns_with_null = get_columns_with_null(df)
print("Columns with null values:", columns_with_null)

df.head()

In [20]:
df.describe()

Data Replacement

In [21]:
# replacing the empty values with the mode (most frequent value) of each column.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on `df[col]` operates on an intermediate object and, with pandas
# Copy-on-Write, leaves `df` itself unchanged.
for col in columns_with_null:
    df[col] = df[col].fillna(df[col].mode()[0])

In [22]:
# confirming that all data have been filled
columns_with_null = get_columns_with_null(df)
print("Columns with null values:", columns_with_null)

Explore other columns

In [23]:
def get_unique_values_and_values_count(frame: pd.DataFrame, target_column: str = 'income') -> None:
    """Print the value counts and unique values of every feature column.

    Args:
        frame: the DataFrame to summarise.
        target_column: name of the column to leave out of the report
            (the target, ``'income'`` by default).

    Returns:
        None. The report is written to stdout.
    """
    # Exclude the target by name rather than by position, so the function does
    # not depend on where the target column sits in the frame.
    columns = [column for column in frame.columns if column != target_column]

    for column in columns:
        print(f'Column Name: {column}')
        # Use the `frame` argument, not a module-level DataFrame.
        print('\t', frame[column].value_counts())
        print('\t', frame[column].unique(), '\n\n')

    print('Columns Count: ', len(columns))


get_unique_values_and_values_count(df)

In [24]:
sns.pairplot(df)

### Relationship with numerical variables

In [26]:
# get all the numerical columns in dataframe
numeric_columns = df.select_dtypes(include='number').columns

print(numeric_columns)

In [27]:
# a function to create a scatter plot with income against input column
def numeric_plot_income_distribution(frame: pd.DataFrame, column: str) -> None:
    """Scatter-plot a standardised numeric column against standardised income.

    Both ``column`` and ``'income'`` are scaled with ``StandardScaler`` and drawn
    as a scatter plot with a red regression line on top.

    Args:
        frame: DataFrame containing ``column`` and an ``'income'`` column.
        column: name of the numeric column to plot on the y axis.

    Returns:
        None. The figure is displayed as a side effect.
    """
    columns = [column, 'income']

    # `fit_transform` returns a plain array, so the DataFrame built from it gets
    # a fresh 0..n-1 index of its own. The caller's frame does not need to be
    # re-indexed and is left untouched.
    scaler = StandardScaler()
    frame_scaled = pd.DataFrame(scaler.fit_transform(frame[columns]), columns=columns)

    plt.figure(figsize=(10, 6))
    ax = sns.scatterplot(data=frame_scaled, y=column, x='income')
    sns.regplot(data=frame_scaled, y=column, x='income', scatter=False, color='red', ax=ax)
    ax.set_title(f'{column} vs income (standardised)')
    # Show (and release) each figure as soon as it is drawn, like the other
    # plotting helpers in this notebook.
    plt.show()


In [28]:
for column in numeric_columns:
    if column == 'income':
        continue
    numeric_plot_income_distribution(df, column)

### Relationship with categorical features

In [29]:
# get categorical columns: every column that is not numeric.
# A list comprehension over `columns` keeps the frame's column order; taking
# `list(set_a - set_b)` gives an order that can differ from run to run, which
# would shuffle the sequence of plots generated from this list.
numeric_columns_set = set(numeric_columns)

categorical_columns = [name for name in columns if name not in numeric_columns_set]
categorical_columns

In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


# a function to create a box plot of scaled income next to the one-hot encoded input column
def categorical_plot_income_distribution(frame: pd.DataFrame, column: str):
    """Box-plot standardised income alongside the one-hot encoding of ``column``.

    ``'income'`` is scaled with ``StandardScaler`` and ``column`` is one-hot
    encoded; the resulting columns are drawn together in a single box plot.

    Args:
        frame: DataFrame containing ``column`` and an ``'income'`` column.
        column: name of the categorical column to encode.

    Returns:
        None. Shape and column names are printed and the figure is displayed.
    """
    numeric_column = 'income'
    categorical_column = column

    # Define preprocessing steps for numeric and categorical columns
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder())
    ])

    # Combine preprocessing steps for both types of columns.
    # `sparse_threshold=0` makes the ColumnTransformer always return a dense
    # array. The encoder's own dense switch was renamed from `sparse` to
    # `sparse_output`, so neither spelling works on every scikit-learn release.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, [numeric_column]),
            ('cat', categorical_transformer, [categorical_column])
        ],
        sparse_threshold=0,
    )

    # Transform the frame that was passed in, not a module-level DataFrame.
    transformed_data = preprocessor.fit_transform(frame[[numeric_column, categorical_column]])

    # Check dimensions and column names
    print("Transformed Data Shape:", transformed_data.shape)
    numeric_scaled_column_name = f"{numeric_column}_scaled"
    # Look the fitted encoder up by name instead of by position in `transformers_`.
    fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
    categorical_encoded_columns = fitted_encoder.get_feature_names_out([categorical_column])
    column_names = [numeric_scaled_column_name] + list(categorical_encoded_columns)
    print("Column Names:", column_names)

    # Create DataFrame from transformed data
    transformed_df = pd.DataFrame(transformed_data, columns=column_names)

    # Plot box plot
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=transformed_df)
    plt.xlabel("Features")
    plt.ylabel("Transformed Values")
    plt.title("Box Plot of Transformed Data")
    plt.show()


In [31]:
for column in categorical_columns:
    categorical_plot_income_distribution(df, column)

Age Distribution

In [32]:
df['age'].hist()
# `plt.show` without parentheses only references the function; it must be called.
plt.show()

By observation, the age attribute is right-skewed and not symmetric.
The minimum and maximum ages are 17 and 90.

Final weight distribution:

In [33]:
df['final_weight'].hist()
plt.show()

The distribution appears to be right-skewed.

capital-gain Distribution

In [34]:
df['capital_gain'].hist()
plt.show()

The capital gain histogram shows that a person has either no gain or a very large gain (around 10k or 99k).

capital loss distribution

In [35]:
df['capital_loss'].hist()
plt.show()

This histogram shows that most of the "capital-loss" values are centered on 0 and only a few are non-zero (2282).
This attribute is similar to capital-gain, i.e. most of the values are centered on 0 (nearly 43000 of them).

In [36]:
sns.relplot(x='capital_gain', y='capital_loss', data=df)
plt.xlabel('capital gain')
plt.ylabel('capital loss')
plt.show()

1. Both capital-gain and capital-loss can be zero (0).
2. If capital_gain is zero, capital_loss can be high (above zero).
3. If capital_loss is zero, capital_gain can be high (above zero).

In [37]:
df.head()


Hours per week distribution

In [38]:
df['hours_per_week'].hist()
plt.show()

In this data, the hours-per-week attribute ranges from 1 to 99.
By observation, most people (around 27000) work 30-40 hours per week.
There are also a few people who work 80-100 hours per week and some who work fewer than 20, which is unusual.

In [39]:
from typing import Optional


def explore_column_distribution(frame: pd.DataFrame, column: str, total_records: Optional[float] = None,
                                hue: Optional[str] = None) -> None:
    """Draw a count plot of ``column`` with each bar labelled by its share of records.

    Args:
        frame: DataFrame to plot.
        column: name of the column whose values are counted on the x axis.
        total_records: denominator used for the percentage labels. Defaults to
            the number of rows in ``frame`` when omitted.
        hue: optional column name used to split every bar by a second variable.

    Returns:
        None. The figure is displayed as a side effect.
    """
    if total_records is None:
        total_records = float(len(frame))

    plt.figure(figsize=(20, 5))

    # `hue=None` is countplot's default, so one call covers both cases.
    ax = sns.countplot(x=column, data=frame, hue=hue)

    for bar in ax.patches:
        bar_height = bar.get_height()
        # Skip patches that represent no records: a NaN height (NaN > 0 is
        # False) would place the label at an invalid position, and a zero
        # height (an empty category/hue combination or an empty placeholder
        # patch) would only add "0.00%" clutter. Also avoid dividing by zero
        # when there are no records.
        if not bar_height > 0 or not total_records:
            continue
        ax.text(bar.get_x() + bar.get_width() / 2.,
                bar_height + 3,
                '{:1.2f}%'.format((bar_height / total_records) * 100),
                ha="center")

    plt.show()

work_class distribution

In [40]:
explore_column_distribution(df, column='work_class', total_records=float(len(df['income'])))

Most people belong to the Private work class, around 75%.
The Without-pay and Never-worked work classes have the lowest counts.

Educational distribution

In [41]:
explore_column_distribution(df, column='education', total_records=float(len(df['income'])))

HS-grad accounts for 32.32% of the education attribute.
Preschool has the lowest count.

Marital status distribution

In [42]:
explore_column_distribution(df, column='marital_status', total_records=float(len(df)))

Married-civ-spouse has the maximum number of samples.
Married-AF-spouse has the minimum number of observations.

Occupational distribution

In [43]:
explore_column_distribution(df, column='occupation', total_records=float(len(df)))

Prof-specialty has the maximum count.
Armed-Forces has minimum samples in the occupation attribute.

Relationship Distribution

In [44]:
explore_column_distribution(df, column='relationship', total_records=float(len(df)))

Husband has maximum percentage among all.

Race distribution

In [45]:
explore_column_distribution(df, column='race', total_records=float(len(df)))

White is the largest group, at about 85.50%.
Black is the second largest.

Sex(gender) distribution

In [46]:
explore_column_distribution(df, column='sex', total_records=float(len(df)))

There are 2 unique categories in gender.
The frequency of male is higher than that of female.

Income Distribution

In [47]:
explore_column_distribution(df, column='income', total_records=float(len(df)))

The income column has two groups:
group 1 = 1 (earns more than 50k): 24.08% of the records
group 2 = 0 (earns 50k or less): 75.92% of the records

Exploring other columns with target column (bi-variate analysis)

In [48]:
def bi_variate_box_plot_exploration(frame: pd.DataFrame, x: str, y: str) -> None:
    """Show a box plot of column ``y`` grouped by column ``x`` of ``frame``.

    The figure is titled "<X> vs <Y>" using the title-cased column names.
    """
    plt.figure(figsize=(20, 8))
    axes = sns.boxplot(data=frame, x=x, y=y)
    axes.set_title(f'{x.title()} vs {y.title()}')
    plt.show()


def bi_variate_count_plot_exploration(frame: pd.DataFrame, x: str, y: str) -> None:
    """Show a count plot of column ``x`` of ``frame``, with bars split by column ``y``.

    The figure is titled "<X> vs <Y>" using the title-cased column names.
    """
    plt.figure(figsize=(20, 7))
    axes = sns.countplot(data=frame, x=x, hue=y)
    axes.set_title(f'{x.title()} vs {y.title()}')
    plt.show()

Age vs Income

In [49]:
bi_variate_box_plot_exploration(frame=df, x='income', y='age')

The income group (<=50k) has a lower median age (34 years) than the income group (>50k), which has a median age of 42 years.

Work_class vs Income

In [50]:
bi_variate_count_plot_exploration(frame=df, x='work_class', y='income')

The data seems to consist mainly of private employees.
In all work classes, the number of people earning less than 50k is greater than the number earning more than 50k.

Capital gain vs Income

In [51]:
bi_variate_box_plot_exploration(frame=df, x='income', y='capital_gain')

Most of the capital gain values are concentrated at 0 for both income groups.

Capital loss vs Income

In [52]:
bi_variate_box_plot_exploration(frame=df, x='income', y='capital_loss')

In [53]:
bi_variate_count_plot_exploration(frame=df, x='relationship', y='income')

Among the relationship categories, Husband has the highest count of people with more than 50k income.

In [54]:
sns.catplot(y="race", hue="income", kind="count", col="sex", data=df);

It is clear that white males form the largest group of people with income above 50k.

In [55]:
explore_column_distribution(df, column='work_class', hue='income', total_records=float(len(df['income'])))

In [56]:
sns.catplot(y="education", hue="income", kind="count", palette="pastel", edgecolor=".6", data=df);

This data mostly consists of people whose education level is HS-grad.

In [57]:
sns.catplot(y='marital_status', hue='sex', col='income', data=df, kind='count', height=4, aspect=.7)

People with marital status Married-civ-spouse form the largest group with income above 50k.

In [58]:
bi_variate_count_plot_exploration(frame=df, y='income', x='occupation')

In [59]:
bi_variate_count_plot_exploration(frame=df, y='income', x='relationship')


In [60]:
# `catplot` is a figure-level function: it always creates its own figure, so a
# preceding `plt.figure(...)` only leaves an extra empty figure behind. The size
# is set through `height`/`aspect` instead (two facets of 10 x 7 inches each).
sns.catplot(y="race", hue="income", kind="count", col="sex", data=df, height=7, aspect=10 / 7);

In [61]:
correlation_matrix = df.corr(numeric_only=True)
correlation_matrix

In [62]:
correlation_matrix[["income"]].sort_values('income', ascending=False)

In [63]:
columns