In [None]:
import sys
import os
from collections import defaultdict
import numpy as np
import pandas as pd
import re

In [None]:
# Location of the cleaned GradCafe CS results, relative to the working
# directory.  Built with os.path.join so it resolves on every OS; the former
# backslash-separated literal was only a valid path on Windows.
cs_file = os.path.join("data", "input", "gradcafe_data-master", "cs", "cs_clean.csv")

# The file is read with header=None (first row is treated as data), so the
# column names are assigned by hand, in file order.
all_columns = ["rowid", "uni_name", "major", "degree", "season", "decision",
               "decision_method", "decision_date", "decision_timestamp",
               "ugrad_gpa", "gre_verbal", "gre_quant", "gre_writing", "is_new_gre", "gre_subject",
               "status", "post_data", "post_timestamp", "comments"]

df = pd.read_csv(cs_file, header=None)
# Assigning .columns raises if the file does not have exactly these 19 columns.
df.columns = all_columns

In [None]:
# Names of the columns that describe an applicant and the outcome.
# NOTE(review): this list is not applied to the preview below, which shows
# every column — confirm whether `df[columns].head()` was intended.
columns = [
    'uni_name',
    'major',
    'degree',
    'season',
    'decision',
    'ugrad_gpa',
    'gre_verbal',
    'gre_quant',
    'gre_writing',
    'is_new_gre',
    'gre_subject',
    'status',
]

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Display pandas' summary statistics for the loaded frame.
df.describe()

In [None]:
def print_admission_stat(tmp):
    """Print descriptive statistics for GPA and the three GRE sections.

    For each of ``ugrad_gpa``, ``gre_verbal``, ``gre_quant`` and
    ``gre_writing`` a heading, a 30-character rule and the min, max,
    25/50/75 % quantiles, mean and standard deviation are written to stdout.

    Parameters
    ----------
    tmp : object exposing the four attributes above as pandas-Series-like
        values (in this notebook: a filtered DataFrame).

    Returns
    -------
    None
    """
    # Headings after the first carry a leading newline to separate sections.
    sections = (
        ("Undergrad GPA", "ugrad_gpa"),
        ("\nGRE Verbal", "gre_verbal"),
        ("\nGRE Quant", "gre_quant"),
        ("\nGRE Writing", "gre_writing"),
    )

    for heading, attribute in sections:
        print(heading)
        print("-"*30)
        # Looked up after the heading is printed, one attribute at a time.
        values = getattr(tmp, attribute)
        report = (
            ("Min: {}", values.min),
            ("Max: {}", values.max),
            ("25%: {}", lambda: values.quantile(0.25)),
            ("50%: {}", lambda: values.quantile(0.5)),
            ("75%: {}", lambda: values.quantile(0.75)),
            ("Mean: {}", values.mean),
            ("Std: {}", values.std),
        )
        for template, compute in report:
            print(template.format(compute()))

In [None]:
def admission_stat_p(tmp, my_ugrad_gpa, my_gre_verbal, my_gre_quant, my_gre_writing, x):
    """Return the highest tier that ALL four of the applicant's scores clear.

    Each tier derives one threshold per metric from ``tmp`` (75 % quantile,
    median, mean, 25 % quantile, minimum).  Tiers are tried from strictest to
    most lenient and the first one for which every score is ``>=`` its
    threshold wins.

    Parameters
    ----------
    tmp : DataFrame-like exposing ``ugrad_gpa``, ``gre_verbal``,
        ``gre_quant`` and ``gre_writing`` as pandas Series.
    my_ugrad_gpa, my_gre_verbal, my_gre_quant, my_gre_writing : numbers
        The applicant's own scores.
    x : any
        Unused.  Kept only so existing call sites keep working; it used to
        feed a quantile whose result was never read.

    Returns
    -------
    75   if every score is at or above the 75 % quantile,
    50   if every score is at or above the median,
    50.0 if every score is at or above the mean (float, as before),
    25   if every score is at or above the 25 % quantile,
    1    if every score is at or above the minimum,
    0    otherwise.
    """
    pairs = (
        (tmp.ugrad_gpa, my_ugrad_gpa),
        (tmp.gre_verbal, my_gre_verbal),
        (tmp.gre_quant, my_gre_quant),
        (tmp.gre_writing, my_gre_writing),
    )

    def clears(threshold_of):
        # True only when every score reaches the threshold of its own column.
        return all(mine >= threshold_of(column) for column, mine in pairs)

    # Ordered strictest first.  The mean tier sits between the median and the
    # 25 % quantile and reports the float 50.0, exactly as the original did.
    tiers = (
        (75, lambda column: column.quantile(0.75)),
        (50, lambda column: column.quantile(0.5)),
        (50.0, lambda column: column.mean()),
        (25, lambda column: column.quantile(0.25)),
        (1, lambda column: column.min()),
    )

    for tier, threshold_of in tiers:
        if clears(threshold_of):
            return tier
    return 0

In [None]:
from scipy import stats


def admission_stat_percent(tmp, my_ugrad_gpa, my_gre_verbal, my_gre_quant, my_gre_writing):
    """Percentile rank of each of the applicant's scores within ``tmp``.

    Every score is ranked against the matching column of ``tmp`` with
    ``scipy.stats.percentileofscore`` using that function's default ``kind``.
    The min/max/mean/std values the previous version also computed were never
    read and have been dropped.

    Parameters
    ----------
    tmp : DataFrame-like exposing ``ugrad_gpa``, ``gre_verbal``,
        ``gre_quant`` and ``gre_writing``.
    my_ugrad_gpa, my_gre_verbal, my_gre_quant, my_gre_writing : numbers
        The applicant's own scores.

    Returns
    -------
    tuple
        ``(ugrad_gpa_x, gre_verbal_x, gre_quant_x, gre_writing_x)`` — the
        four percentile ranks, in that order.
    """
    ugrad_gpa_x = stats.percentileofscore(tmp.ugrad_gpa, my_ugrad_gpa)
    gre_verbal_x = stats.percentileofscore(tmp.gre_verbal, my_gre_verbal)
    gre_quant_x = stats.percentileofscore(tmp.gre_quant, my_gre_quant)
    gre_writing_x = stats.percentileofscore(tmp.gre_writing, my_gre_writing)

    return ugrad_gpa_x, gre_verbal_x, gre_quant_x, gre_writing_x

In [None]:
# Category values for the `status` and `decision` columns; np.nan is listed
# as well (presumably for rows where the field is missing — TODO confirm).
status_list = [
    'American',
    'International',
    'International with US Degree',
    'Other',
    np.nan,
]
decision_list = [
    'Accepted',
    'Interview',
    'Other',
    'Rejected',
    'Wait listed',
    np.nan,
]

# Distinct university names, in ascending sort order.
uni_names = sorted(set(df.uni_name))

In [None]:
# ---- Cohort to compare against ---------------------------------------------
# uni_name = "Stanford University"
major, degree = "CS", "PhD"
decision = "Accepted"
season = "F15"
is_new_gre = True
# Applicant statuses accepted by the status filter.
status1, status2 = 'International', 'International with US Degree'

# ---- Applicant's own profile -----------------------------------------------
my_ugrad_gpa = 3.80
my_gre_verbal, my_gre_quant = 152, 162
my_gre_writing = 3.0

In [None]:
# When True, a percentile breakdown is printed for each qualifying university.
details = True
# (The original line carried a stray leading space, which is an
# IndentationError and prevented the whole cell from running.)
uni_match = {}

# Minimum percentile the applicant must reach, per metric, for a university
# to be reported.  Loop-invariant, so defined once up front.
ugrad_gpa_cut_off = 10
gre_verbal_cut_off = 5
gre_quant_cut_off = 10
gre_writing_cut_off = 5

for uni in uni_names:
    # Rows for this university that match the configured cohort.
    tmp = df[(df["uni_name"] == uni)
             & (df["major"] == major)
             & (df["degree"] == degree)
             & (df["decision"] == decision)
             & (df["is_new_gre"] == is_new_gre)
             # The status filter is disabled in this cell:
             # & ((df["status"] == status1) | (df["status"] == status2))
             & (df["season"] == season)
             ]
    # Keep only rows whose scores lie inside the accepted ranges
    # (Series.between is inclusive on both ends; NaN never matches).
    tmp = tmp[tmp["ugrad_gpa"].between(0, 4.0)
              & tmp["gre_verbal"].between(130, 170)
              & tmp["gre_quant"].between(130, 170)
              & tmp["gre_writing"].between(0.0, 6.0)
              ]

    ugrad_gpa_x, gre_verbal_x, gre_quant_x, gre_writing_x = admission_stat_percent(
        tmp, my_ugrad_gpa, my_gre_verbal, my_gre_quant, my_gre_writing)

    if (len(tmp) > 0
            and ugrad_gpa_cut_off <= ugrad_gpa_x <= 100
            and gre_verbal_cut_off <= gre_verbal_x <= 100
            and gre_quant_cut_off <= gre_quant_x <= 100
            and gre_writing_cut_off <= gre_writing_x <= 100):
        # Only universities with more than 5 matching rows are printed.
        if details and len(tmp) > 5:
            print(uni)
            print("-"*50)
            print("Count          : {}      ".format(len(tmp)))
            print("-"*50)
            print("Undergrad GPA  : {0:.2f}%".format(ugrad_gpa_x))
            print("GRE Verbal     : {0:.2f}%".format(gre_verbal_x))
            print("GRE Quant      : {0:.2f}%".format(gre_quant_x))
            print("GRE Writing    : {0:.2f}%".format(gre_writing_x))
            print("-"*50)
            print("\n\n")
    

In [None]:
def get_match(df, uni_names, my_ugrad_gpa, my_gre_verbal, my_gre_quant, my_gre_writing):
    """Rank universities by the applicant's mean percentile among their rows.

    For every name in ``uni_names`` the rows of ``df`` for that university are
    restricted to in-range scores (GPA 0-4, GRE verbal/quant 130-170, GRE
    writing 0-6).  The applicant's four percentile ranks within those rows
    are then averaged.  Universities with no remaining rows are omitted.

    Parameters
    ----------
    df : pandas.DataFrame with columns ``uni_name``, ``ugrad_gpa``,
        ``gre_verbal``, ``gre_quant`` and ``gre_writing``.
    uni_names : iterable of university names to evaluate.
    my_ugrad_gpa, my_gre_verbal, my_gre_quant, my_gre_writing : numbers
        The applicant's own scores.

    Returns
    -------
    dict
        ``{uni_name: mean_percentile}`` ordered from highest to lowest mean
        percentile (ties keep the order of ``uni_names``).
    """
    mean_percentiles = {}
    for uni in uni_names:
        tmp = df[df["uni_name"] == uni]
        # Series.between is inclusive on both ends; NaN never matches.
        tmp = tmp[tmp["ugrad_gpa"].between(0.0, 4.0)
                  & tmp["gre_verbal"].between(130, 170)
                  & tmp["gre_quant"].between(130, 170)
                  & tmp["gre_writing"].between(0.0, 6.0)
                  ]

        # Nothing to rank against: skip before computing any percentile.
        if len(tmp) == 0:
            continue

        ugrad_gpa_x, gre_verbal_x, gre_quant_x, gre_writing_x = admission_stat_percent(
            tmp, my_ugrad_gpa, my_gre_verbal, my_gre_quant, my_gre_writing)
        mean_percentiles[uni] = (ugrad_gpa_x + gre_verbal_x + gre_quant_x + gre_writing_x)/4

    # Sort once, after the loop; the previous version re-sorted and rebuilt
    # the dict on every iteration, giving the same result at far higher cost.
    ranked = sorted(mean_percentiles.items(), key=lambda kv: kv[1], reverse=True)
    return dict(ranked)

In [None]:
# Restrict the data to the configured cohort: same major, degree, decision,
# GRE format and season, and an applicant status equal to status1 or status2.
tmp = df.loc[
    (df["major"] == major)
    & (df["degree"] == degree)
    & (df["decision"] == decision)
    & (df["is_new_gre"] == is_new_gre)
    & (
        (df["status"] == status1)
        | (df["status"] == status2)
    )
    & (df["season"] == season)
]

In [None]:
# Score every university against the applicant's profile, using the
# cohort-filtered rows in `tmp`.
uni_match = get_match(
    df=tmp,
    uni_names=uni_names,
    my_ugrad_gpa=my_ugrad_gpa,
    my_gre_verbal=my_gre_verbal,
    my_gre_quant=my_gre_quant,
    my_gre_writing=my_gre_writing,
)

In [None]:
# Row budget for the listing.  The early exit that consumed it is currently
# disabled, so every match is printed and `count` is only decremented.
count = 100
for uni, mean_percentile in uni_match.items():
    # get_match already stores the mean of the four percentiles, so the value
    # is printed as-is; dividing by 4 again showed a quarter of the real score.
    print("{} : {}".format(uni, mean_percentile))
    count -= 1