# Exploratory Data Analysis


## Importing the Python libraries

In [3]:
# Data processing
import pandas as pd
# Linear algebra library
import numpy as np
# Data visualisation
import matplotlib.pyplot as plt
# Statistical data visualisation
import seaborn as sns
%matplotlib inline

## Read in the data

## Data Description

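We will use a data frame with 777 observations on the following 17 variables (the original dataset has an 18th variable, `Private`, which is not listed here and is only used in the optional section at the end).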
* Apps: Number of applications received
* Accept: Number of applications accepted
* Enroll: Number of new students enrolled
* Top10perc: Percentage of new students from top 10% of H.S. class
* Top25perc: Percentage of new students from top 25% of H.S. class
* F.Undergrad: Number of full time undergraduates
* P.Undergrad: Number of part time undergraduates
* Outstate: Out-of-state tuition cost
* Room.Board: Room and boarding costs
* Books: Estimated book costs
* Personal: Estimated personal spending
* PhD: Percentage of faculty with Ph.D.’s
* Terminal: Percentage of faculty with terminal degree
* S.F.Ratio: Student to faculty ratio
* perc.alumni: Percentage of alumni who donate
* Expend: Instructional expenditure per student
* Grad.Rate: Graduation rate

In [4]:
# Load the college dataset. The path is relative, so it is resolved against the
# kernel's working directory; a FileNotFoundError here means the CSV is not there.
college_csv_path = 'College_Data.csv'
df = pd.read_csv(college_csv_path)

FileNotFoundError: ignored

## Check the data that was read in

In [None]:
# Preview the first rows to confirm the file was parsed into the expected columns
df.head()

In [None]:
# Print the column dtypes and non-null counts to spot missing or mistyped data
df.info()

## Are schools with higher out-of-state tuition (Outstate) also the ones that have higher room and boarding costs (Room.Board)?


In [None]:
# Scatter plot with a fitted regression line: out-of-state tuition vs. room & board costs
sns.set_style('whitegrid')
point_style = {'s': 50, 'alpha': 0.5}       # marker size and transparency of the scatter points
trend_style = {'color': 'green', 'lw': 2}   # colour and width of the regression line
sns.lmplot(
    x='Outstate',
    y='Room.Board',
    data=df,
    height=6,
    aspect=2,
    scatter_kws=point_style,
    line_kws=trend_style,
)

# Title and axis labels, with larger fonts for readability
plt.title('Relationship between Room & Board Costs and Out-of-State Tuition', fontsize=16)
plt.xlabel('Out-of-State Tuition', fontsize=14)
plt.ylabel('Room & Board Costs', fontsize=14)
plt.show()

## How does the extent of alumni support affect the Graduation rate of students in university?

In [None]:
# Scatter plot with a fitted regression line: share of donating alumni vs. graduation rate
sns.set_style('whitegrid')
point_style = {'s': 50, 'alpha': 0.5}       # marker size and transparency of the scatter points
trend_style = {'color': 'green', 'lw': 2}   # colour and width of the regression line
sns.lmplot(
    x='perc.alumni',
    y='Grad.Rate',
    data=df,
    height=6,
    aspect=2,
    scatter_kws=point_style,
    line_kws=trend_style,
)

# Title and axis labels, with larger fonts for readability
plt.title('Relationship between Alumni Percentage and Graduation Rate', fontsize=16)
plt.xlabel('Alumni Percentage', fontsize=14)
plt.ylabel('Graduation Rate', fontsize=14)
plt.show()

## How does an increasing Student to Faculty member ratio affect the Graduation rate of students in university?

In [None]:
# Scatter plot with a fitted regression line: student/faculty ratio vs. graduation rate
sns.set_style('whitegrid')
point_style = {'s': 50, 'alpha': 0.5}     # marker size and transparency of the scatter points
trend_style = {'color': 'red', 'lw': 2}   # colour and width of the regression line
sns.lmplot(
    x='S.F.Ratio',
    y='Grad.Rate',
    data=df,
    height=6,
    aspect=2,
    scatter_kws=point_style,
    line_kws=trend_style,
)

# Title and axis labels, with larger fonts for readability
plt.title('Relationship between Student to Faculty Ratio and Graduation Rate', fontsize=16)
plt.xlabel('S/F Ratio', fontsize=14)
plt.ylabel('Graduation Rate', fontsize=14)
plt.show()

## Notice that some graduation rates are above 100%, which is not possible, so we cap them at 100%

In [None]:
# A graduation rate above 100% cannot be real, so cap those rows at 100
over_100_percent = df['Grad.Rate'] > 100
df.loc[over_100_percent, 'Grad.Rate'] = 100

# K Means Cluster Creation

In [None]:
from sklearn.cluster import KMeans

## Analyse Data

In [None]:
# Summary statistics per column, to compare the value ranges of the features
df.describe()

#Data is not standardised

## Bring all values to the same magnitude by rescaling the data (min-max scaling to the [0, 1] range)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min and max from df, then rescale every column with them
scaler = MinMaxScaler()
scaler.fit(df)
scaled_df = scaler.transform(df)

In [None]:
# The scaler returned a bare array; wrap it in a DataFrame carrying the original column names
scaled_df = pd.DataFrame(data=scaled_df, columns=df.columns)

## See results of standardisation

In [None]:
# Summary statistics of the rescaled data, to check the effect of the scaler on each column
scaled_df.describe()

## Create a kmeans function and fit the scaled data to it

In [None]:
# First attempt: 4 clusters.
# random_state pins the centroid initialisation, so the clusters (and every number
# derived from them) are the same on each Restart & Run All.
kmeans = KMeans(n_clusters=4, init="k-means++", n_init=10, random_state=42)
# fit_predict fits the model and returns each row's cluster index in a single pass
predictions = kmeans.fit_predict(scaled_df)

## Inertia Value

In [None]:
kmeans.inertia_


# Inertia: measure the distance between each data point and its centroid,
# square that distance, and sum the squares over every cluster.

# A good model is one with low inertia AND a low number of clusters.
# This is a trade-off, because inertia decreases as the number of clusters increases.

## Visualising the Elbow Method (Using inertia values)

In [None]:
# SSE = Sum of Squared Errors (the fitted model's inertia)

# Candidate k values, defined once so the fitting loop and the plot cannot drift apart
cluster_range = range(1, 20)

# Fit one model per k and record its SSE.
# random_state pins the centroid initialisation so the curve is identical on every run.
# The models are throwaway objects, so the `kmeans` fitted in an earlier cell is not overwritten.
SSE = [
    KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, random_state=42)
    .fit(scaled_df)
    .inertia_
    for num_clusters in cluster_range
]

# Visualising the Elbow Method
frame = pd.DataFrame({'Cluster': cluster_range, 'SSE': SSE})
plt.figure(figsize=(12,6))
plt.plot(frame['Cluster'], frame['SSE'], marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()  # also keeps the cell from echoing the repr of the last matplotlib call

# Use the silhouette score as a more concrete way of identifying the best k value

## A more concrete way of finding the best K value

In [None]:
# A more concrete way of finding the best K value: the mean silhouette score of each candidate k

from sklearn.metrics import silhouette_score

k_values = [2, 3, 4, 5, 6, 7, 8]
silhouette_avg = []
for num_clusters in k_values:
    # random_state pins the centroid initialisation, so the scores (and therefore the
    # k they point to) do not change from one run to the next
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, random_state=42)
    # fit_predict fits the model and returns the cluster index of every row
    cluster_labels = kmeans.fit_predict(scaled_df)
    silhouette_avg.append(silhouette_score(scaled_df, cluster_labels))

plt.plot(k_values, silhouette_avg, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Silhouette score')
plt.title('Silhouette analysis For Optimal k')
plt.show()


## Re-fitting the model with our new K value based on the Silhouette Score

In [None]:
# Refit with k=2, the value chosen from the silhouette analysis.
# random_state pins the centroid initialisation so the result is reproducible.
kmeans = KMeans(n_clusters=2, init="k-means++", n_init=10, random_state=42)
# fit_predict fits the model and returns each row's cluster index in a single pass
predictions = kmeans.fit_predict(scaled_df)

# Optional code: Understanding K-means better

In this workshop we have covered k-means clustering and the metrics we can use to evaluate a k-means model. However, we haven't actually seen how those choices affect the accuracy of our model!

The code below will show you how optimising the number of clusters based on the silhouette score and inertia values can improve the model's accuracy.

This section is purely optional; feel free to skip it.


In [None]:
# This dataset includes the label saying whether each university is private or not.
# Note that real-life data for clustering will not come with such labels.
labelled_csv_path = 'College_Data_unclean.csv'
df2 = pd.read_csv(labelled_csv_path)

In [None]:
# Preview the labelled dataset, including the raw 'Private' column
df2.head()

In [None]:
# Convert the Yes/No values in the Private column to integers: 'Yes' -> 0, anything else -> 1.
# Values that are already 0 or 1 are passed through unchanged, which makes this cell safe
# to re-run; without that, a second run would turn every value into 1 (because 0 != 'Yes').
df2['Private'] = df2['Private'].apply(
    lambda x: x if x in (0, 1) else (0 if x == 'Yes' else 1)
)

In [None]:
# Preview again to confirm the 'Private' column now holds integers
df2.head()

In [None]:
# Rescale the labelled data with a fresh min-max scaler
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min and max from df2, then rescale every column with them
scaler = MinMaxScaler()
scaler.fit(df2)
scaled_df2 = scaler.transform(df2)

In [None]:
# The scaler returned a bare array; wrap it in a DataFrame carrying df2's column names
scaled_df2 = pd.DataFrame(data=scaled_df2, columns=df2.columns)

## Accuracy of our initial K value

In [None]:
from sklearn.cluster import KMeans

# Cluster on the features only; the 'Private' label is held back for evaluation.
features = scaled_df2.drop('Private', axis=1)
# random_state pins the centroid initialisation so the reported score is reproducible
kmeans = KMeans(n_clusters=4, init="k-means++", n_init=10, random_state=42)
predictions = kmeans.fit_predict(features)

# check how many of the samples were correctly labeled
# K-means numbers its clusters arbitrarily, so a cluster id cannot be compared with the
# 0/1 'Private' value directly. Instead, each cluster is assigned the 'Private' value
# held by most of its members, and a sample counts as correct when it matches that value.
# NOTE: with this majority-vote scoring, using more clusters tends to raise the score,
# so it should not be used on its own to choose k.
labels = kmeans.labels_
private_labels = scaled_df2['Private']
# rows = cluster id, columns = 'Private' value, cells = number of samples
cluster_vs_private = pd.crosstab(labels, private_labels.to_numpy())
correct_labels = int(cluster_vs_private.max(axis=1).sum())
print("Result: %d out of %d samples were correctly labeled." % (correct_labels, private_labels.size))
print('Accuracy score: {0:0.2f}'.format(correct_labels/float(private_labels.size)))

## Accuracy of our K value after visualising Silhouette score

In [None]:
# Cluster on the features only; the 'Private' label is held back for evaluation.
features = scaled_df2.drop('Private', axis=1)
# random_state pins the centroid initialisation so the reported score is reproducible
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=42)
predictions = kmeans.fit_predict(features)

# check how many of the samples were correctly labeled
# K-means numbers its clusters arbitrarily: cluster 0 may just as well be the group that
# corresponds to Private == 1, in which case a direct comparison would report
# 1 - accuracy. Instead, each cluster is assigned the 'Private' value held by most of
# its members, and a sample counts as correct when it matches that value.
labels = kmeans.labels_
private_labels = scaled_df2['Private']
# rows = cluster id, columns = 'Private' value, cells = number of samples
cluster_vs_private = pd.crosstab(labels, private_labels.to_numpy())
correct_labels = int(cluster_vs_private.max(axis=1).sum())
print("Result: %d out of %d samples were correctly labeled." % (correct_labels, private_labels.size))
print('Accuracy score: {0:0.2f}'.format(correct_labels/float(private_labels.size)))

In [None]:
## As we can see, choosing a better K value gives an improvement in accuracy (compare the two accuracy scores above)