In [5]:
# Imports
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [6]:
# Read in the data
college_data = pd.read_csv("data/cc_institution_details.csv")
job_data = pd.read_csv("https://raw.githubusercontent.com/DG1606/CMS-R-2020/master/Placement_Data_Full_Class.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'data/cc_institution_details.csv'

### Dataset 1, Step 1

In [None]:
college_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 63 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   index                                 3798 non-null   int64  
 1   unitid                                3798 non-null   int64  
 2   chronname                             3798 non-null   object 
 3   city                                  3798 non-null   object 
 4   state                                 3798 non-null   object 
 5   level                                 3798 non-null   object 
 6   control                               3798 non-null   object 
 7   basic                                 3798 non-null   object 
 8   hbcu                                  94 non-null     object 
 9   flagship                              50 non-null     object 
 10  long_x                                3798 non-null   float64
 11  lat_y            

#### Problems: 
This dataset shares information about many colleges, and there are many columns included. We have metrics on class size, graduation percentage, financial information, test scores, and many others. However, a lot of the data is missing. Students may care about how much money they can receive in awards but not have access to this information. The problem is students not being able to see much award money is awarded and needing to see what information can show how much money schools award.

#### Question:
Can we predict how much money schools spend on awards from public information?

### Dataset 1, Step 2
- Our independent business metric is the amount of money that schools spend on awards
- Taking a look at what information we can easily acquire, we can start with level, private vs. public, student count, median SAT score, graduation percentage, and % of full time students for our predictors. Our target variable is money spent on awards, so we will look at the awards_per_value and exp_award_value, the amount of awards given out per 100 undergraduates.

### Data Preparation

In [None]:
# Creating a new dataset that includes only the variables of interest
college_data_filtered = college_data[['chronname', 'level', 'control', 'student_count', 'med_sat_value', 'grad_100_percentile',
                                       'ft_pct', 'exp_award_value', 'awards_per_value']]


In [None]:
# Checking structure of our dataset to see if there are any issues
college_data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   chronname            3798 non-null   object 
 1   level                3798 non-null   object 
 2   control              3798 non-null   object 
 3   student_count        3798 non-null   int64  
 4   med_sat_value        1337 non-null   float64
 5   grad_100_percentile  3467 non-null   float64
 6   ft_pct               3794 non-null   float64
 7   exp_award_value      3798 non-null   int64  
 8   awards_per_value     3798 non-null   float64
dtypes: float64(4), int64(2), object(3)
memory usage: 267.2+ KB


In [None]:
# Checking how many unique values we have for certain variables that may be categorical
print("Level classifications:", college_data_filtered['level'].unique())
print("Control classifications:", college_data_filtered['control'].unique())

Level classifications: ['4-year' '2-year']
Control classifications: ['Public' 'Private not-for-profit' 'Private for-profit']


Great. There are only 4-year and 2-year universities, and only public, private not-for profit and private for-profit.
We definitely want to make these categorical variables. 
We could divide into just public and private, but let's keep further classifications because there are only three

In [None]:
# Making level and control into categorical variables
college_data_filtered['level'] = college_data_filtered['level'].astype('category')
college_data_filtered['control'] = college_data_filtered['control'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  college_data_filtered['level'] = college_data_filtered['level'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  college_data_filtered['control'] = college_data_filtered['control'].astype('category')


All of our other variables have distinct values. However, we can collapse to narrow down into broad categories.
Specifically, we can divide count of students into levels 1-4, with one being extremely small schools(bottom 25th %), and 4 being very large(top 25%)

In [None]:
# Boxplot to visualize distribution of size of schools
college_data_filtered.boxplot(column= 'student_count', vert= False, grid=False)

<Axes: >

Seems like we have a lot of outliers. May be better to just print the percentiles

In [None]:
count_pct = np.percentile(college_data_filtered['student_count'], [25, 50, 75])
count_25 =  count_pct[0]
count_50 =  count_pct[1]
count_75 =  count_pct[2]

In [None]:
# Loop through colleges to assign values based on size, make categorical variable
for index, row in college_data_filtered.iterrows():
    if row['student_count'] < count_pct[0]:
        college_data_filtered.at[index, 'student_count'] = 1
    elif row['student_count'] < count_pct[1]:
        college_data_filtered.at[index, 'student_count'] = 2
    elif row['student_count'] < count_pct[2]:
        college_data_filtered.at[index, 'student_count'] = 3
    else:
        college_data_filtered.at[index, 'student_count'] = 4

college_data_filtered['student_count'] = college_data_filtered['student_count'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  college_data_filtered['student_count'] = college_data_filtered['student_count'].astype('category')


The next step will be to implement one-hot encoding on our categorical variables, making them into binary variables

In [None]:
category_list = list(college_data_filtered.select_dtypes('category')) # select function to find the categorical variables and create a list
print(category_list)
college_data_filtered_1h = pd.get_dummies(college_data_filtered, columns=category_list)
college_data_filtered_1h



['level', 'control', 'student_count']


Unnamed: 0,chronname,med_sat_value,grad_100_percentile,ft_pct,exp_award_value,awards_per_value,level_2-year,level_4-year,control_Private for-profit,control_Private not-for-profit,control_Public,student_count_1,student_count_2,student_count_3,student_count_4
0,Alabama A&M University,823.0,15.0,93.8,105331,14.2,False,True,False,False,True,False,False,True,False
1,University of Alabama at Birmingham,1146.0,67.0,72.7,136546,20.9,False,True,False,False,True,False,False,False,True
2,Amridge University,,0.0,62.7,58414,29.9,False,True,False,True,False,True,False,False,False
3,University of Alabama at Huntsville,1180.0,34.0,74.4,64418,20.9,False,True,False,False,True,False,False,False,True
4,Alabama State University,830.0,11.0,91.0,132407,11.6,False,True,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3793,Grace College of Divinity,,0.0,34.5,24047,26.2,False,True,False,True,False,True,False,False,False
3794,John Paul the Great Catholic University,1069.0,64.0,92.1,105965,16.6,False,True,False,True,False,True,False,False,False
3795,Chamberlain College of Nursing-Missouri,,82.0,57.0,52936,55.1,False,True,True,False,False,True,False,False,False
3796,Minneapolis Media Institute,,54.0,70.1,74857,32.8,True,False,True,False,False,True,False,False,False


In [None]:
job_data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


Is employability percentage a good predictor of job status?