# <center> CCT College Dublin </center>

## <center> Assessment Cover Page</center>


<br><br><br>
---

<span style="font-size:larger;">

**Module Title:**&nbsp;&nbsp;&nbsp;Data Preparation
	
**Assessment Title:**&nbsp;&nbsp;&nbsp;Machine Learning (10 ECTS)
	
**Lecturer Name:**&nbsp;&nbsp;&nbsp;Dr. Muhammad Iqbal
	
**Student Full Name:**&nbsp;&nbsp;&nbsp;Yumiko Maria Bejarano Azogue 
	
**Student Number:**&nbsp;&nbsp;&nbsp;2024144
	
**Assessment Due Date:**&nbsp;&nbsp;&nbsp;21st April 2024
	
**Date of Submission:**&nbsp;&nbsp;&nbsp;21st April 2024
    
</span> 

<br><br><br>

---

#### Declaration 

```
By submitting this assessment, I confirm that I have read the CCT policy on Academic Misconduct and understand the implications of submitting work that is not my own or does not appropriately reference material taken from a third party or other source. I declare it to be my own work and that all material from third parties has been appropriately referenced. I further confirm that this work has not previously been submitted for assessment by myself or someone else in CCT College Dublin or any other higher education institution.
```
<br><br><br>

--- 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid")

from scipy import stats
from scipy.stats import shapiro
from scipy.stats import chi2_contingency
from scipy.stats import chi2


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, auc, roc_curve, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold



In [None]:
# Import this library to suppress the warnings
import warnings
warnings.filterwarnings('ignore')   # Ignore every warning from here on (this also hides deprecation notices)

#sns.set(style="darkgrid")

In [None]:
# Load the placement dataset from the CSV file (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='collegePlace.csv')

# The dataset has the following columns:

* Age : Age At The Time Of Final Year
* Gender : Gender Of Candidate
* Stream : Engineering Stream That The Candidate Belongs To
* Internships : Number Of Internships Undertaken During The Course Of Studies, Not Necessarily Related To College Studies Or Stream
* CGPA : CGPA Till 6th Semester
* Hostel : Whether Student Lives In College Accommodation
* HistoryOfBacklogs : Whether Student Ever Had Any Backlogs In Any Subjects
* PlacedOrNot : Target Variable

In [None]:
# Preview the top of the dataset (head() returns the first 5 rows by default)
print("First 5 records of the dataset:", data.head(), sep="\n")

# Meta information of Dataframe

In [None]:
# Information about the dataframe
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a trailing "None".
print("\nInformation about the dataframe:")
data.info()

In [None]:
# Heading for the styled summary table rendered below
print("\n Descriptive statistics of the dataframe:")

# Transposed summary: one row per described column, one column per statistic
descriptive_stats = data.describe().T

# Build the styled view in a single chain:
#   - in-cell bars for the mean
#   - red gradient for the standard deviation
#   - diverging gradient for the median (50th percentile)
styled_stats = (
    descriptive_stats.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

# Leave the Styler as the last expression so the notebook renders it
styled_stats


In [None]:
# Compact view: only the mean, minimum and maximum of every described column
print("\nDescriptive statistics of the dataframe:")
summary_rows = ['mean', 'min', 'max']
print(data.describe().loc[summary_rows].T)

### Descriptive Statistics Summary:

- Age ranges from 19 to 30 years old.
- The lowest number of internships recorded is 0 (no internships), while the highest is 3.
- Most students did not reside in a hostel (average hostel occupancy is below 0.5).
- Most students have no backlogs (average backlog count is below 0.5).
- The majority of students have been successfully placed in jobs (placement rate is above 0.5).

In [None]:
# Rename the two verbose columns to shorter names.
# The result is reassigned rather than using inplace=True, which matches the
# reassignment style of the lower-casing step and keeps the cell chainable.
data = data.rename(columns={'HistoryOfBacklogs': 'backlogs', 'PlacedOrNot': 'placed'})

In [None]:
# # Custom function definitions
# def get_scores(y, y_pred):
#     data={'Accuracy': np.round(accuracy_score(y, y_pred),2),
#     'Precision':np.round(precision_score(y, y_pred),2),
#     'Recall':np.round(recall_score(y, y_pred),2),
#     'F1':np.round(f1_score(y, y_pred),2),
#     'ROC AUC':np.round(roc_auc_score(y, y_pred),2)}
#     scores_df = pd.Series(data).to_frame('scores')
#     return scores_df

In [None]:
# Normalise every column label to lowercase
data = data.rename(columns=lambda label: label.lower())

###  Checking for NaN values

Fortunately, the data has no missing values.


In [None]:
# Count the NULL entries in each column of the dataset
print("\nChecking for missing values:")
missing_per_column = data.isnull().sum()
print(missing_per_column)

In [None]:
# Detecting the duplicates
# data.duplicated().sum() #1829

In [None]:
# data.drop_duplicates(inplace=True)
# data.duplicated().sum()

# Exploratory Data Analysis (EDA) and visualization

In [None]:
# Two-colour palette used to distinguish the two 'placed' outcomes in the plots.
# NOTE(review): the order was originally annotated "(yes,no)", but num_plot labels
# its hue legend ['no', 'yes'] — presumably the first colour is "not placed" and
# the second "placed"; confirm against the rendered plots.
palette =['#d74a49', '#92ba92'] # presumably (no, yes) — TODO confirm
#92ba92

In [None]:
# Keep only the numeric columns; used for the skewness computation further down
numeric_df = data.select_dtypes(include='number')

In [None]:
# Select the numerical variables of the dataset
numerical_features = data.select_dtypes(include=['number'])

# len() of a DataFrame is its row count, so the number of variables is taken
# from the column index instead.
print('Number of numerical variables: ', len(numerical_features.columns))
print('\n')

print('Numeric Column names:', numerical_features.columns)
print('\n')

# Preview the numerical variables
data[numerical_features.columns].head()


In [None]:
# Skewness quantifies how asymmetric each numeric column's distribution is
skewness = numeric_df.skew(axis=0)
skewness

### Age Distribution by Gender

In [None]:
# Frequency of every distinct age in the whole dataset (most frequent first)
age_frequencies = data['age'].value_counts()

# Two plain columns: the age value and how many times it occurs
age_counts_df = pd.DataFrame({
    'Unique Age Values': age_frequencies.index,
    'Counts': age_frequencies.values,
})

# Show the resulting frequency table
print(age_counts_df)

In [None]:
# Number of students of each age within each gender ...
per_gender_age_counts = data.groupby('gender')['age'].value_counts()
# ... flattened into a table with columns gender, age and Counts
age_counts_by_gender = per_gender_age_counts.reset_index(name='Counts')

print(age_counts_by_gender)

In [None]:
# Age distribution by gender, drawn as grouped bars.
# Both genders are plotted in ONE barplot call with `hue` so they share a single
# categorical x-axis. Two separate barplot calls each build their own category
# positions, which overlays the bars (hiding the shorter one) and can misplace
# them when the two genders do not cover the same set of ages.
# (plt and sns are already imported in the notebook's import cell.)
gender_colors = {'Male': '#a2d2ff', 'Female': '#faaac7'}  # light blue / pink

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='age', y='Counts', hue='gender', hue_order=['Male', 'Female'],
            data=age_counts_by_gender, palette=gender_colors, ax=ax)

# Write the count above every bar. The `> 0` test skips placeholder patches
# whose height is zero or NaN (age/gender combinations with no students),
# for which int() would otherwise fail or print a meaningless "0".
for p in ax.patches:
    height = p.get_height()
    if height > 0:
        ax.annotate(f"{int(height)}", (p.get_x() + p.get_width() / 2., height),
                    ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Adding labels and title
ax.set_xlabel('Age')
ax.set_ylabel('Counts')
ax.set_title('Age Distribution by Gender')

# seaborn adds the Male/Female legend itself when `hue` is used
plt.show()


## Placement Details by Gender

In [None]:
# Placement details by gender
# Row subsets for each gender
is_male = data['gender'] == "Male"
is_female = data['gender'] == "Female"
male = data[is_male]
female = data[is_female]

# Head counts: everyone of a gender, then those among them with placed == 1
total_male = len(male)
total_female = len(female)
total_male_pass = len(male[male['placed'] == 1])
total_female_pass = len(female[female['placed'] == 1])

In [None]:
# Share of placed students within each gender, as a percentage rounded to 2 decimals
pass_male_percentage = np.round(total_male_pass * 100 / total_male, 2)
pass_female_percentage = np.round(total_female_pass * 100 / total_female, 2)


In [None]:
# Headline figures in display order; each value is wrapped in a one-element list
labels_and_values = [
    ("Total Male", total_male),
    ("Total Female", total_female),
    ("Total male pass", total_male_pass),
    ("Total female pass", total_female_pass),
    ("% of Passed Male", pass_male_percentage),
    ("% of Passed Female", pass_female_percentage),
]
details = {label: [value] for label, value in labels_and_values}

details

In [None]:
# Bar chart of the gender placement summary held in `details`
fig, ax = plt.subplots(figsize=(10, 6))

# Bar labels are the dictionary keys; the bar heights are the dictionary's
# list values concatenated into one flat list
categories = [key for key in details]
values = []
for entries in details.values():
    values.extend(entries)
colors = ['#a2d2ff', '#faaac7']

bars = ax.bar(categories, values, color=colors, alpha=0.7)

# Print each bar's height just above it (3 points of vertical offset)
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    ax.annotate('{}'.format(height),
                xy=(x_center, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom')

ax.set_title('Placement Details by Gender')
ax.set_ylabel('Count')

# Hand-built legend: one round marker per gender in the matching bar colour
male_patch, female_patch = [
    plt.Line2D([0], [0], marker='o', color='w', label=name, markerfacecolor=shade, markersize=10)
    for name, shade in (('Male', '#a2d2ff'), ('Female', '#faaac7'))
]
ax.legend(handles=[male_patch, female_patch])

plt.show()







In [None]:
# Restrict to the numeric columns, then compute their pairwise Pearson correlations
numeric_data = data.select_dtypes(include=np.number)
correlation_matrix = numeric_data.corr(method='pearson')

correlation_matrix

In [None]:
# Annotated heatmap of the correlation matrix on a fixed [-1, 1] colour scale
heatmap_options = dict(cmap='RdBu', annot=True, vmin=-1, vmax=1)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()

In [None]:
def num_plot(df, col, title, palette):
    """Draw a box plot stacked above a count plot of `col`, split by 'placed'.

    Parameters
    ----------
    df : DataFrame holding `col` and a 'placed' column.
    col : name of the column shown on the x-axis of both panels.
    title : text placed above the top panel.
    palette : colours forwarded to both seaborn calls.

    The figure is displayed with plt.show(); nothing is returned.
    """
    placed_numeric = pd.to_numeric(df['placed'])
    fig, (box_ax, count_ax) = plt.subplots(2, 1, figsize=(5.2, 5), gridspec_kw={"height_ratios": (.1, .9)})

    # Top panel (10% of the height): horizontal box plot stripped of ticks and labels
    box_ax.set_title(title, fontsize=18)
    sns.boxplot(y=placed_numeric, x=col, data=df, orient='h', ax=box_ax, palette=palette)
    box_ax.set(yticks=[])
    box_ax.set_ylabel('')
    box_ax.set_xlabel('')

    # Bottom panel: counts per value of `col`, one bar per 'placed' level,
    # with the count written on each bar instead of a y-axis scale
    sns.countplot(x=col, data=df, ax=count_ax, hue=placed_numeric, palette=palette)
    count_ax.set_xlabel(col, fontsize=16)
    count_ax.set_yticks([])
    for bar_group in count_ax.containers:
        count_ax.bar_label(bar_group, fmt='%.1f')

    plt.legend(title='Placed?', title_fontsize=14, labels=['no', 'yes'], fontsize=13, fancybox=True, shadow=True, frameon=True)
    plt.tight_layout()
    plt.show()


In [None]:
# How does CGPA relate to job placement? Plot CGPA split by placement outcome.
num_plot(df=data, col='cgpa', title='CGPA by Placed', palette=palette)



In [None]:
# Rows where the student was placed with a CGPA of exactly 5
placed_with_cgpa_5 = (data['placed'] == 1) & (data['cgpa'] == 5)
data.loc[placed_with_cgpa_5]

In [None]:
# Placed male students aged 23 or 24 from the IT or Computer Science streams
age_23_or_24 = (data['age'] == 23) | (data['age'] == 24)
it_or_cs = (data['stream'] == 'Information Technology') | (data['stream'] == 'Computer Science')
placed_male = (data['gender'] == 'Male') & (data['placed'] == 1)
df_1 = data.loc[age_23_or_24 & it_or_cs & placed_male]

In [None]:
# CGPA of the df_1 subgroup: box plot on the left, histogram on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
left_panel, right_panel = ax
sns.boxplot(data=df_1, x='cgpa', ax=left_panel)
sns.histplot(data=df_1, x='cgpa', ax=right_panel)
plt.show()

In [None]:
# Treat placed students with a CGPA of exactly 5 as outliers and drop their rows
outlier_mask = (data['cgpa'] == 5) & (data['placed'] == 1)
df_clean = data.drop(index=data.index[outlier_mask])

removed_count = len(data) - len(df_clean)
print('Removed {} outliers !'.format(removed_count))

In [None]:
# Same CGPA-by-placement figure, now drawn from the cleaned dataset
num_plot(df=df_clean, col='cgpa', title='CGPA by Placed', palette=palette)


In [None]:
# (rows, columns) of the dataset after the outlier rows were dropped
df_clean.shape

In [None]:
# Column dtypes and non-null counts of the cleaned dataset
df_clean.info()

# EncodingSince there is no order/hierarchy among the categorical features, they will be encoded by *One hot encoding* (using pd.get_dummies).

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each
df_encoded = pd.get_dummies(data=df_clean, drop_first=True)

df_encoded.head(5)

## Splitting the data into training, testing, and validation sets   

The last step in data preprocessing is to split the data into training, testing, and validation sets:
* Training set: The neural network will be trained on this subset of the data.
* Validation set: This set of data allows us to perform hyperparameter tuning (that is, tuning the number of hidden layers) using an unbiased source of data.
* Testing set: The final evaluation of the neural network will be based on this subset of the data.

# First, let's separate the dataset into X (input features) and y (target variable):

In [None]:
# Separate the target column (y) from the remaining predictor columns (X)
target_column = 'placed'
y = df_encoded[target_column]
X = df_encoded.drop(columns=target_column)

print(X.shape)
print(y.shape)

In [None]:
# (rows, feature columns) of the predictor matrix
X.shape

#### Then, make the first split to split the data into the training set (80%) and the testing set (20%) according to the preceding diagram:

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four resulting pieces
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Feature Scaling
# Applied to the independent variables only. StandardScaler standardises each
# feature to zero mean and unit variance (it does NOT bound values to [-1, +1]).

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training set only ...
X_train = sc.fit_transform(X_train)
# ... and reuse those same statistics for the test set. Re-fitting on X_test
# would scale it with different parameters than the models were trained on and
# leak test-set statistics into the evaluation.
X_test  = sc.transform(X_test)

# Wrap the scaled array in a DataFrame again for display
X_train = pd.DataFrame(X_train)
X_train.head()


# Model building

In [None]:
# Seed every random number generator in use so that reruns give the same results
import random
import numpy as np
import tensorflow

random.seed(1)                  # Python's built-in generator
np.random.seed(1)               # NumPy's global generator
tensorflow.random.set_seed(9)   # TensorFlow's global seed

In [None]:
import keras
from keras.models import Sequential

# Start from an empty Sequential model; layers are appended to it in the following cells
model = Sequential()

In [None]:
from keras.layers import Dense

# Width of the network input: one value per column of the training data
colum = X_train.shape[1]

# First hidden layer: 32 ReLU units, also declaring the input width
model.add(Dense(units=32, activation='relu', input_dim=colum))


In [None]:
# Second hidden layer: 16 ReLU units
model.add(Dense(units=16, activation='relu'))

In [None]:
# Output layer: a single sigmoid unit, giving one value between 0 and 1 per sample
model.add(Dense(units=1, activation='sigmoid'))

# Model compilation

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric
compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

# Model training

In [None]:
# (rows, features) of the scaled training data passed to fit() below
X_train.shape

In [None]:
# To train the MLP model defined in the earlier steps, call the fit function.
# NOTE(review): the call below runs 10 epochs; the trailing "# 200" suggests 200 was the originally intended value — confirm which one is wanted.
model.fit(X_train, y_train, epochs = 10) # 200

# Results analysis
Having successfully trained our MLP, let's evaluate our model based on the testing accuracy, confusion matrix, and receiver operating characteristic (ROC) curve.

# Testing accuracy
We can evaluate our model on the training set and testing set using the evaluate() function:

In [None]:
# Report accuracy on the training data and on the held-out test data.
# evaluate() returns the loss followed by the compiled metrics, so index 1 is the accuracy.
evaluation_sets = (
    ("Training Accuracy: %.2f%%\n", X_train, y_train),
    ("Testing Accuracy: %.2f%%\n", X_test, y_test),
)
for message, features, targets in evaluation_sets:
    scores = model.evaluate(features, targets)
    print(message % (scores[1]*100))

# Confusion matrix

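The confusion matrix is a useful visualization tool that shows the true negatives, false positives, false negatives, and true positives produced by our model. Beyond a simple accuracy metric, we should also look at the confusion matrix to understand the performance of the model. The definitions of true negative, false positive, false negative, and true positive are as follows:

* **True negative:** the model predicted "not placed" and the student was not placed.
* **False positive:** the model predicted "placed" but the student was not placed.
* **False negative:** the model predicted "not placed" but the student was placed.
* **True positive:** the model predicted "placed" and the student was placed.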


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Sigmoid outputs of the network for every test row, flattened to a 1-D array
y_test_pred = model.predict(X_test)
y_test_pred = y_test_pred.flatten()
# Threshold the raw outputs at 0.5. They are deliberately not rounded first:
# rounding to 2 decimals would turn e.g. 0.504 into 0.50, which fails the
# `> 0.5` test and wrongly assigns the sample to class 0.
y_test_pred_new = np.where(y_test_pred > 0.5, 1, 0)
c_matrix = confusion_matrix(y_test, y_test_pred_new)
c_matrix

In [None]:
# Heatmap of the confusion matrix (rows: actual class, columns: predicted class).
# fmt='d' prints the cell counts as plain integers; seaborn's default annotation
# format ('.2g') would show larger counts in scientific notation (e.g. 3.1e+02).
ax = sns.heatmap(c_matrix, annot=True, fmt='d',
                 xticklabels=['No', 'Yes'],
                 yticklabels=['No', 'Yes'],
                 cbar=False, cmap='Blues')
ax.set_xlabel("Prediction")
ax.set_ylabel("Test")
ax.set_title("Placed ?")
plt.show()

# ROC curve
A receiver operating characteristic curve (ROC) is a graphical plot that illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied.

For classification tasks, we should also look at the ROC curve to evaluate our model. The ROC curve is a plot with the True Positive Rate (TPR) on the y axis and the False Positive Rate (FPR) on the x axis.

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# Raw (un-thresholded) model outputs for the test set; these are the scores fed to roc_curve
y_test_pred_probs = model.predict(X_test)

In [None]:
# False positive rate and true positive rate at each score threshold;
# the thresholds themselves (third return value) are not needed here
FPR, TPR, _ = roc_curve(y_true=y_test, y_score=y_test_pred_probs)

In [None]:
# ROC curve of the network against the chance-level diagonal
plt.plot(FPR, TPR)
plt.plot([0,1],[0,1],'--', color='black') #diagonal line
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Render the figure explicitly, like the other plotting cells, so the cell does
# not end on the Text(...) repr returned by plt.ylabel
plt.show()

# Discussion
From the preceding ROC Curve, we can see that the model performs rather well, close to the model ROC Curve shown in the preceding diagram. This shows that our model is able to differentiate samples of different classes, making good predictions.

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

# Fit a decision tree with default hyper-parameters on the training data
model = DecisionTreeClassifier() 
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Mean accuracy on each split (the label typo "Accuaracy" is corrected so the
# output matches the other model cells)
print("Training Accuracy :", model.score(X_train, y_train))
print("Testing Accuracy :", model.score(X_test, y_test))

# Confusion matrix of the test-set predictions
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default hyper-parameters, then predict the test set
model = RandomForestClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Mean accuracy on the training and test splits
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

# Confusion matrix of the test-set predictions
cm = confusion_matrix(y_test, y_pred)
print(cm)

## k fold Cross Validation

In [None]:
# k-fold cross-validation
from sklearn.model_selection import cross_val_score

# 10-fold accuracy of the `model` currently in scope (the random forest from
# the previous cell when the notebook is run in order) on the training data
cvs = cross_val_score(model, X_train, y_train, cv=10)
mean_accuracy = cvs.mean()
accuracy_std = cvs.std()
print(cvs, "\n Mean Accuracy :", mean_accuracy, "\nStandard Deviation :", accuracy_std)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings, then predict the test set
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Mean accuracy on the training and test splits
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

# Confusion matrix of the test-set predictions
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with default settings, then predict the test set
model = SVC().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Mean accuracy on the training and test splits
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

# Confusion matrix of the test-set predictions
cm = confusion_matrix(y_test, y_pred)
print(cm)

## k fold Cross Validation

In [None]:
# k-fold cross-validation
from sklearn.model_selection import cross_val_score

# 10-fold accuracy of the `model` currently in scope (the SVC from the previous
# cell when the notebook is run in order) on the training data
cvs = cross_val_score(model, X_train, y_train, cv=10)
mean_accuracy = cvs.mean()
accuracy_std = cvs.std()
print(cvs, "\n Mean Accuracy :", mean_accuracy, "\n Standard Deviation :", accuracy_std)

# Reference:
Neural Network Projects with Python by James Loy Published by Packt Publishing, 2019
https://www.analyticsvidhya.com/