# Dataset Exploration

Dependency Installation

In [None]:
# Install the third-party packages used by this notebook: pandas, matplotlib
# and seaborn (imported below) plus scikit-learn (used for the dataset split).
%pip install pandas matplotlib seaborn scikit-learn

In [None]:
# NOTE(review): pandas is already covered by the preceding %pip install cell,
# so this cell looks redundant — confirm before removing.
%pip install pandas

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


## Dataset Overview

### Statistical Summary of Entire Dataset

In [None]:
# Read the MBA CSV into the notebook-wide DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="Dataset/MBA.csv")
df.head(n=5)

In [None]:
# Dimensions first, then column dtypes / non-null counts.
print(f"Shape: {df.shape}")
df.info()

# Descriptive statistics; shown as the cell's output value.
print("Statistical summary")
df.describe()

### Statistical Summary of Individual Features

In [None]:
# Columns of interest: gender, international, gpa, major, race, gmat, work_exp, work_industry, admission

#### Summary of Gender

In [None]:
# Absolute and relative frequency of each value in the gender column.
individual_counts = df["gender"].value_counts()
prop = df["gender"].value_counts(normalize=True)

print("Counts", individual_counts, sep="\n")
print(" \nProportion", prop, sep="\n")

#### Summary for international feature

In [None]:
# Absolute and relative frequency of each value in the international column.
individual_counts = df["international"].value_counts()
prop = df["international"].value_counts(normalize=True)

print("Counts", individual_counts, sep="\n")
print(" \nProportion", prop, sep="\n")

#### Summary for GPA


In [None]:
# Count, mean, std, min, quartiles and max of the gpa column.
df.loc[:, "gpa"].describe()

#### Summary for Majors

In [None]:
# Absolute and relative frequency of each value in the major column.
individual_counts = df["major"].value_counts()
prop = df["major"].value_counts(normalize=True)

print("Counts", individual_counts, sep="\n")
print(" \nProportion", prop, sep="\n")

#### Race

In [None]:
# Absolute and relative frequency of each value in the race column.
# value_counts() skips nulls by default, so missing entries are not listed.
individual_counts = df["race"].value_counts()
prop = df["race"].value_counts(normalize=True)

print("Counts", individual_counts, sep="\n")
print(" \nProportion", prop, sep="\n")

#### GMAT Score

In [None]:
# Count, mean, std, min, quartiles and max of the gmat column.
df.loc[:, "gmat"].describe()


#### Work experience

In [None]:
# Count, mean, std, min, quartiles and max of the work_exp column.
df.loc[:, "work_exp"].describe()


#### Work Industry

In [None]:
# Absolute and relative frequency of each value in the work_industry column.
individual_counts = df["work_industry"].value_counts()
prop = df["work_industry"].value_counts(normalize=True)

print("Counts", individual_counts, sep="\n")
print(" \nProportion", prop, sep="\n")

## Data Visualization

### Univariate Graphs

In [None]:
# Per-column count of missing values. Printed explicitly because a bare
# expression that is not the last statement of a cell is evaluated but never
# displayed.
print(df.isnull().sum())

# Cell-by-cell map of missing entries (one row per record, one column per
# feature). With the viridis colormap, True (null) is drawn in yellow and
# False (present) in purple.
sns.heatmap(df.isnull(), cbar=False, cmap="viridis")
# Render the figure explicitly, consistent with the other plotting cells,
# instead of echoing the returned Axes object.
plt.show()

Yellow represents null values

Purple represents present (non-null) values

From the heatmap above, we can see that the admission column has a large amount of missing data, while the race column has some missing data. According to the dataset description on Kaggle, a null in the admission column means the applicant was denied (rejected), and a null in the race column indicates an international student.


In [None]:
# Bar chart of the number of rows per gender category.
sns.countplot(data=df, x="gender").set_title("Gender Distribution")
plt.show()

In [None]:
# Bar chart of the number of rows per value of the international flag.
sns.countplot(data=df, x="international").set_title("International Distribution")
plt.show()

In [None]:
# Bar chart of the number of rows per major.
sns.countplot(data=df, x="major").set_title("Major Distribution")
plt.show()

In [None]:
# Bar chart of the number of rows per race category, drawn before the null
# race values are filled in further down.
sns.countplot(data=df, x="race").set_title("Distribution of Race")
plt.show()

In [None]:
# A null admission value means the application was rejected, so label it
# explicitly; non-null values are kept as they are.
df["admission"] = df["admission"].where(df["admission"].notna(), "Rejected")

In [None]:
# Bar chart of the number of rows per admission outcome.
sns.countplot(data=df, x="admission").set_title("Distribution of Admission")
plt.show()

In [None]:

# Null race values are labelled 'International'; rows already labelled
# 'Other' are then folded into that same category.
# NOTE(review): merging 'Other' into 'International' conflates two groups —
# confirm this is intended.
df["race"] = (
    df["race"]
    .fillna("International")
    .replace("Other", "International")
)


In [None]:
# Same race bar chart as before, re-drawn from the current contents of the
# race column.
sns.countplot(data=df, x="race").set_title("Distribution of Race")
plt.show()


In [None]:
# Frequency of every work industry.
industry_counts = df["work_industry"].value_counts()

# Industries that occur fewer than 200 times are considered rare.
to_combine = industry_counts.loc[industry_counts.lt(200)].index

# Collapse every rare industry into a single 'Other' bucket; all other
# values are left untouched.
df["work_industry"] = df["work_industry"].mask(
    df["work_industry"].isin(to_combine), "Other"
)


In [None]:
# Wide canvas so the rotated industry labels remain readable.
plt.figure(figsize=(12, 6))

ax = sns.countplot(data=df, x="work_industry")
ax.set_title("Distribution of Various Work Industries (Grouped Low Frequencies as Other)")

# Tilt the x tick labels and tighten the layout so they are not clipped.
plt.xticks(rotation=65)
plt.tight_layout()
plt.show()


In [None]:
# Histogram of GPA with a kernel-density estimate overlaid.
sns.histplot(data=df, x="gpa", kde=True)
plt.title("GPA Distribution")
plt.show()

In [None]:
# Histogram of GMAT scores.
sns.histplot(data=df, x="gmat")
plt.title("GMAT Distribution")
plt.show()

In [None]:
# Histogram of the work_exp column.
sns.histplot(data=df, x="work_exp")
plt.title("Work Experience Distribution")
plt.show()

In [None]:
# Horizontal box plot of GPA (median, quartiles and outliers).
sns.boxplot(data=df, x="gpa")
plt.title("GPA Boxplot")
plt.show()

In [None]:

# Keep only the rows whose outcome is 'Admit' or 'Waitlist' (despite its
# name, admitted_df includes wait-listed rows) and renumber them from 0.
admitted_df = df.loc[df["admission"].isin(["Admit", "Waitlist"])].reset_index(drop=True)


In [None]:

# Persist the admitted/wait-listed subset without writing the row index.
admitted_df.to_csv(path_or_buf="admitted_or_waitlist.csv", index=False)


In [None]:
# Number of rows that exactly repeat an earlier row (first occurrence is not
# counted as a duplicate).
print(df.duplicated(keep="first").sum())

### Bivariate Graphs

# Data Pre-processing

## Dataset Split


In [None]:
from sklearn.model_selection import train_test_split

# Target label vs. every remaining column as features.
y = df["admission"]
X = df.drop(columns="admission")

# First cut: hold out 10% of all rows as the test set, stratified on the
# label so each outcome keeps its share. Fixed seed for reproducibility.
X_train_universe, X_test, y_train_universe, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=10,
    stratify=y,
)

# Second cut: 10% of the remaining rows become the validation set, again
# stratified on the label.
X_train_set, X_validation_set, Y_train_set, Y_validation_set = train_test_split(
    X_train_universe,
    y_train_universe,
    test_size=0.1,
    random_state=10,
    stratify=y_train_universe,
)


In [None]:
# Report the size and column summary of each partition produced by the split.
# info() is called on the partition itself (not on the full `df`) so that the
# row counts and non-null counts shown belong to that partition.
print("Shape of Test Set:", X_test.shape)
X_test.info()

print("Shape of Validation Set:", X_validation_set.shape)
X_validation_set.info()

print("Shape of Training Set:", X_train_set.shape)
X_train_set.info()

## Normalization

In [None]:
# Work on a copy so the raw numeric values in `df` are left untouched.
normalized_df = df.copy()

# ---------------------------------------------------------------------------
# Z-score normalization
# ---------------------------------------------------------------------------
# Only the quantitative columns are standardized: (value - mean) / std.
# NOTE(review): mean and std are computed over the whole dataset, which
# includes the rows held out for validation/test in the split above — confirm
# whether the statistics should come from the training rows only.
columns_to_be_normalized = ['gpa', 'gmat', 'work_exp']

for column in columns_to_be_normalized:
    normalized_df[column] = (normalized_df[column] - normalized_df[column].mean()) / normalized_df[column].std()

# Printed explicitly: a bare `.head()` in the middle of a cell is evaluated
# but never displayed.
print(normalized_df.head())

# ---------------------------------------------------------------------------
# One-hot encoding
# ---------------------------------------------------------------------------
categorical_columns = ['gender', 'international', 'major', 'race', 'work_industry']

# Maps each categorical column name to the array of distinct values it holds.
unique_values_dict = {}

for col in categorical_columns:
    unique_values_dict[col] = df[col].unique()

# One 0/1 indicator column per (column, value) pair. The indicator is added to
# `df` (as before) and also to `normalized_df`, so that the z-scored frame
# ends up fully encoded instead of keeping its raw categorical columns.
for col, unique_values in unique_values_dict.items():
    for val in unique_values:
        # Column name is "<column><value>" with '/' removed and spaces
        # replaced by underscores.
        new_col_name = f"{col}{str(val).replace('/', '').replace(' ', '_')}"

        indicator = (df[col] == val).astype(int)
        df[new_col_name] = indicator
        normalized_df[new_col_name] = indicator

# The original categorical columns are now redundant in both frames.
df.drop(columns=categorical_columns, inplace=True)
normalized_df.drop(columns=categorical_columns, inplace=True)


print(df.head())
print(normalized_df.head())
