In [1]:
import sys
import os
from collections import defaultdict
import numpy as np
import pandas as pd
import re

In [2]:
cs_file = r"data\input\gradcafe_data-master\cs\cs_clean.csv"

df = pd.read_csv(cs_file, header=None)

all_columns = ["rowid", "uni_name", "major", "degree", "season", "decision",
               "decision_method", "decision_date", "decision_timestamp", 
               "ugrad_gpa", "gre_verbal", "gre_quant", "gre_writing", "is_new_gre", "gre_subject", 
               "status", "post_data", "post_timestamp", "comments"]

df.columns = all_columns

In [3]:
columns = ['uni_name', 'major', 'degree', 'season', 'decision', 'ugrad_gpa', 'gre_verbal', 'gre_quant', 
           'gre_writing', 'is_new_gre', 'gre_subject', 'status']

df.head()

Unnamed: 0,rowid,uni_name,major,degree,season,decision,decision_method,decision_date,decision_timestamp,ugrad_gpa,gre_verbal,gre_quant,gre_writing,is_new_gre,gre_subject,status,post_data,post_timestamp,comments
0,0,Tufts University,CS,PhD,S16,Accepted,E-mail,16-11-2015,1447650000.0,3.6,166.0,163.0,4.5,True,,American,16-11-2015,1447650000,Met with professor beforehand. 1 year academic...
1,1,SUNY Stony Brook,CS,MS,S16,Rejected,E-mail,14-11-2015,1447477000.0,3.5,144.0,167.0,3.0,True,,International,14-11-2015,1447477200,non CS background
2,2,Columbia University,CS,MS,S16,Other,Other,14-11-2015,1447477000.0,,,,,,,International,14-11-2015,1447477200,So there was no actual Nov 15 deadline? Or did...
3,3,Columbia University,CS,PhD,S16,Other,Other,13-11-2015,1447391000.0,,,,,,,International,13-11-2015,1447390800,I haven't found the website with the deadline ...
4,4,Columbia University,CS,MS,S16,Other,Other,13-11-2015,1447391000.0,,,,,,,Other,13-11-2015,1447390800,Thanks guys! Good to know I'm not the only one...


In [4]:
df.describe()

Unnamed: 0,rowid,decision_timestamp,ugrad_gpa,gre_verbal,gre_quant,gre_writing,gre_subject,post_timestamp
count,27822.0,27805.0,7222.0,8493.0,8493.0,8248.0,262.0,27822.0
mean,13910.5,1325873000.0,3.698129,288.676439,343.097021,3.863227,800.305344,1326125000.0
std,8031.663931,78625550.0,0.646575,213.479164,277.912011,0.725768,90.468673,78422950.0
min,0.0,401342400.0,0.9,133.0,136.0,2.0,200.0,1139375000.0
25%,6955.25,1267679000.0,3.5,154.0,163.0,3.5,750.0,1267679000.0
50%,13910.5,1331096000.0,3.7,161.0,168.0,4.0,820.0,1331226000.0
75%,20865.75,1394510000.0,3.87,480.0,740.0,4.5,860.0,1394770000.0
max,27821.0,1576386000.0,9.99,800.0,800.0,6.0,990.0,1447650000.0


In [5]:


features = ['ugrad_gpa', 'gre_verbal', 'gre_quant', 'gre_writing']

uni_names = list(set(df["uni_name"]))
uni_names.sort()

# status = list(set(df["uni_name"]))
# status.sort()
status = ['International', 'American', 'International with US Degree']

# decision = list(set(df["uni_name"]))
# decision.sort()
decision = ['Accepted', 'Interview', 'Rejected', 'Wait listed', 'Other']

# season = list(set(df["season"]))
season = ['F09', 'F10', 'S10', 'F11', 'S11',  'F12', 'S12', 'F13', 'S13', 'F14', 'S14', 'F15', 'S15', 'F16', 'S16']

new_gre = True
print_bool = True


major = "CS"
degree = "PhD"

uni_summary = {}

tmp_df = df

i=0
for uni in uni_names: 
    uni_summary[uni] = {}    
    
    if print_bool and i%10==0:
        print("{}:{}".format(i+1, uni))
        print("-"*110)
        
    
    for sn in season:
        uni_summary[uni][sn] = {}
        
        for d in decision:
            uni_summary[uni][sn][d] = {}
            
            for st in status:
                uni_summary[uni][sn][d][st] = {}
                
                mean_ugrad_gpa = []
                mean_gre_verbal = []
                mean_gre_quant = []
                mean_gre_writing = []

                std_ugrad_gpa = []
                std_gre_verbal = []
                std_gre_quant = []
                std_gre_writing = []
                
                try:
                    tmp_cur_df = tmp_df[(tmp_df["major"]==major) \
                                        & (tmp_df["degree"]==degree) \
                                        & (tmp_df["uni_name"]==uni) \
                                        & (tmp_df["season"]==sn) \
                                        & (tmp_df["decision"]==d) \
                                        & (tmp_df["status"]==st)
                                        # & (tmp_df["is_new_gre"]==new_gre)
                                       ]

                    if len(tmp_cur_df)>0:
                        uni_summary[uni][sn][d][st]["n_instances"] = len(tmp_cur_df)
                        uni_summary[uni][sn][d][st]["mean_ugrad_gpa"] = tmp_cur_df.ugrad_gpa.mean()
                        uni_summary[uni][sn][d][st]["mean_gre_verbal"] = tmp_cur_df.gre_verbal.mean()
                        uni_summary[uni][sn][d][st]["mean_gre_quant"] = tmp_cur_df.gre_quant.mean()
                        uni_summary[uni][sn][d][st]["mean_gre_writing"] = tmp_cur_df.gre_writing.mean()

                        uni_summary[uni][sn][d][st]["std_ugrad_gpa"] = tmp_cur_df.ugrad_gpa.std()
                        uni_summary[uni][sn][d][st]["std_gre_verbal"] = tmp_cur_df.gre_verbal.std()
                        uni_summary[uni][sn][d][st]["std_gre_quant"] = tmp_cur_df.gre_quant.std()
                        uni_summary[uni][sn][d][st]["std_gre_writing"] = tmp_cur_df.gre_writing.std()

                        if print_bool and i%10==0 and uni_summary[uni][sn][d][st]["n_instances"]>0:
                            print("season: {}, decision: {}, status: {}".format(sn, d, st)) 
                            print("mean_ugrad_gpa: {0:.2f}(+-{1:.2f}), mean_gre_verbal: {2:.2f}(+-{3:.2f}), mean_gre_quant: {4:.2f}(+-{5:.2f}), mean_gre_writing: {6:.2f}(+-{7:.2f})".format(uni_summary[uni, sn, d, st, "mean_ugrad_gpa"], uni_summary[uni, sn, d, st, "std_ugrad_gpa"], uni_summary[uni, sn, d, st, "mean_gre_verbal"], uni_summary[uni, sn, d, st, "std_gre_verbal"], uni_summary[uni, sn, d, st, "mean_gre_quant"], uni_summary[uni, sn, d, st, "std_gre_quant"], uni_summary[uni, sn, d, st, "mean_gre_writing"], uni_summary[uni, sn, d, st, "std_gre_writing"]))
                            print("mean_ugrad_gpa: {}, mean_gre_verbal:{}, mean_gre_quant:{}, mean_gre_writing: {}".format(uni_summary[uni, sn, d, st, "mean_ugrad_gpa"], uni_summary[uni, sn, d, st, "mean_gre_verbal"], uni_summary[uni, sn, d, st, "mean_gre_quant"], uni_summary[uni, sn, d, st, "mean_gre_writing"]))
                            print("-"*110)
                except:
                    print("error:", i, uni, sn, d, st)
    i+=1

1:ABC
--------------------------------------------------------------------------------------------------------------
error: 5 Aligarh Muslim University S13 Interview International
11:Arizona State University (ASU)
--------------------------------------------------------------------------------------------------------------
season: F10, decision: Accepted, status: International
error: 10 Arizona State University (ASU) F10 Accepted International
season: F10, decision: Accepted, status: International with US Degree
error: 10 Arizona State University (ASU) F10 Accepted International with US Degree
season: F10, decision: Rejected, status: International
error: 10 Arizona State University (ASU) F10 Rejected International
season: F11, decision: Accepted, status: International
error: 10 Arizona State University (ASU) F11 Accepted International
season: F11, decision: Accepted, status: International with US Degree
error: 10 Arizona State University (ASU) F11 Accepted International with US Degree


season: F13, decision: Other, status: International with US Degree
error: 60 Duke University F13 Other International with US Degree
season: F14, decision: Accepted, status: International
error: 60 Duke University F14 Accepted International
season: F14, decision: Interview, status: American
error: 60 Duke University F14 Interview American
season: F14, decision: Interview, status: International with US Degree
error: 60 Duke University F14 Interview International with US Degree
season: F14, decision: Rejected, status: International
error: 60 Duke University F14 Rejected International
season: F14, decision: Rejected, status: American
error: 60 Duke University F14 Rejected American
season: F14, decision: Rejected, status: International with US Degree
error: 60 Duke University F14 Rejected International with US Degree
season: F14, decision: Other, status: International
error: 60 Duke University F14 Other International
season: F14, decision: Other, status: International with US Degree
error: 

season: F14, decision: Accepted, status: International
error: 160 New York University (NYU) F14 Accepted International
season: F14, decision: Accepted, status: American
error: 160 New York University (NYU) F14 Accepted American
season: F14, decision: Interview, status: International
error: 160 New York University (NYU) F14 Interview International
season: F14, decision: Interview, status: American
error: 160 New York University (NYU) F14 Interview American
season: F14, decision: Rejected, status: International
error: 160 New York University (NYU) F14 Rejected International
season: F14, decision: Rejected, status: American
error: 160 New York University (NYU) F14 Rejected American
season: F14, decision: Rejected, status: International with US Degree
error: 160 New York University (NYU) F14 Rejected International with US Degree
season: F15, decision: Accepted, status: International
error: 160 New York University (NYU) F15 Accepted International
season: F15, decision: Accepted, status: Ame

season: F12, decision: Accepted, status: International
error: 270 University Of California, Irvine (UCI) F12 Accepted International
season: F12, decision: Accepted, status: International with US Degree
error: 270 University Of California, Irvine (UCI) F12 Accepted International with US Degree
season: F12, decision: Interview, status: International
error: 270 University Of California, Irvine (UCI) F12 Interview International
season: F12, decision: Rejected, status: International
error: 270 University Of California, Irvine (UCI) F12 Rejected International
season: F12, decision: Rejected, status: American
error: 270 University Of California, Irvine (UCI) F12 Rejected American
season: F12, decision: Rejected, status: International with US Degree
error: 270 University Of California, Irvine (UCI) F12 Rejected International with US Degree
season: F12, decision: Other, status: International
error: 270 University Of California, Irvine (UCI) F12 Other International
season: F12, decision: Other, 

291:University Of Dhaka
--------------------------------------------------------------------------------------------------------------
301:University Of Illinois, Chicago
--------------------------------------------------------------------------------------------------------------
311:University Of Louisville
--------------------------------------------------------------------------------------------------------------
season: F12, decision: Accepted, status: International with US Degree
error: 310 University Of Louisville F12 Accepted International with US Degree
season: F14, decision: Accepted, status: International
error: 310 University Of Louisville F14 Accepted International
season: F15, decision: Accepted, status: International
error: 310 University Of Louisville F15 Accepted International
321:University Of Memphis
--------------------------------------------------------------------------------------------------------------
season: F11, decision: Accepted, status: International
er

season: F12, decision: Accepted, status: International
error: 360 University Of South Carolina F12 Accepted International
season: F12, decision: Other, status: International
error: 360 University Of South Carolina F12 Other International
season: F13, decision: Accepted, status: International
error: 360 University Of South Carolina F13 Accepted International
season: F14, decision: Accepted, status: International
error: 360 University Of South Carolina F14 Accepted International
season: F15, decision: Accepted, status: International
error: 360 University Of South Carolina F15 Accepted International
371:University Of Texas, Dallas (UT Dallas)
--------------------------------------------------------------------------------------------------------------
season: F10, decision: Accepted, status: International
error: 370 University Of Texas, Dallas (UT Dallas) F10 Accepted International
season: F10, decision: Accepted, status: International with US Degree
error: 370 University Of Texas, Dallas

season: F15, decision: Other, status: American
error: 380 University Of Washington, Seattle (UW) F15 Other American
season: F15, decision: Other, status: International with US Degree
error: 380 University Of Washington, Seattle (UW) F15 Other International with US Degree
391:University of Pittsburgh
--------------------------------------------------------------------------------------------------------------
season: F12, decision: Wait listed, status: International
error: 390 University of Pittsburgh F12 Wait listed International
401:Waltersun Memorial Institute Of Technology
--------------------------------------------------------------------------------------------------------------
season: F15, decision: Accepted, status: International
error: 400 Waltersun Memorial Institute Of Technology F15 Accepted International
411:Wright State University
--------------------------------------------------------------------------------------------------------------
season: F12, decision: Accepted

In [6]:
import json
import pickle
    
    
outdir = "data/output/" 
uni_summary_file = outdir+"uni_cs_phd_summary_clean.p"

with open(uni_summary_file, 'wb') as fp:
    pickle.dump(uni_summary, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
def check_key(uni_summary, uni, season, decision, status, item):
    if uni in uni_summary:
        if season in uni_summary[uni]:
            if decision in uni_summary[uni][season]:
                if status in uni_summary[uni][season][decision]:
                    if item in uni_summary[uni][season][decision][status]:
                        return True
    return False

In [49]:
i=0
decision = 'Accepted'
status = 'International with US Degree'
uni_avg = {}
for uni in uni_summary:
    uni_avg[uni] = {}
    avg_mean_ugrad_gpa = 0
    avg_mean_gre_verbal = 0
    avg_mean_gre_quant = 0
    avg_mean_gre_writing = 0
    
    count_mean_ugrad_gpa = 0
    count_mean_gre_verbal = 0
    count_mean_gre_quant = 0
    count_mean_gre_writing = 0
    
    for season in uni_summary[uni]:
        if check_key(uni_summary, uni, season, decision, status, 'mean_ugrad_gpa'):
            mean_ugrad_gpa = uni_summary[uni][season][decision][status]['mean_ugrad_gpa']
            if mean_ugrad_gpa>=0 and mean_ugrad_gpa<=4.0:
                count_mean_ugrad_gpa += 1
                avg_mean_ugrad_gpa += mean_ugrad_gpa
                # print(mean_ugrad_gpa)
        
        if check_key(uni_summary, uni, season, decision, status, 'mean_gre_verbal'):
            mean_gre_verbal = uni_summary[uni][season][decision][status]['mean_gre_verbal']
            if mean_gre_verbal>=130 and mean_gre_verbal<=170:
                count_mean_gre_verbal += 1
                avg_mean_gre_verbal += mean_gre_verbal
                print(mean_gre_verbal)
                
         
        if check_key(uni_summary, uni, season, decision, status, 'mean_gre_quant'):
            mean_gre_quant = uni_summary[uni][season][decision][status]['mean_gre_quant']
            
            if mean_gre_quant>=130 and mean_gre_quant<=170:
                count_mean_gre_quant += 1
                avg_mean_gre_quant += mean_gre_quant
                print(mean_gre_quant)

        
        if check_key(uni_summary, uni, season, decision, status, 'mean_gre_writing'):
            mean_gre_writing = uni_summary[uni][season][decision][status]['mean_gre_writing']
            if mean_gre_writing>=0 and mean_gre_writing<=6:
                count_mean_gre_writing += 1
                avg_mean_gre_writing += mean_gre_writing
                print(mean_gre_writing)
     
    uni_avg[uni]["avg_mean_ugrad_gpa"] = -1
    uni_avg[uni]["avg_mean_gre_verbal"] = -1
    uni_avg[uni]["avg_mean_gre_quant"] = -1
    uni_avg[uni]["avg_mean_gre_writing"] = -1
    
    if count_mean_ugrad_gpa>0:
        uni_avg[uni]["avg_mean_ugrad_gpa"] = avg_mean_ugrad_gpa/count_mean_ugrad_gpa
    
    if count_mean_gre_verbal>0:
        uni_avg[uni]["avg_mean_gre_verbal"] = avg_mean_gre_verbal/count_mean_gre_verbal
    
    if count_mean_gre_quant>0:
        uni_avg[uni]["avg_mean_gre_quant"] = avg_mean_gre_quant/count_mean_gre_quant
        
    if count_mean_gre_writing>0:
        uni_avg[uni]["avg_mean_gre_writing"] = avg_mean_gre_writing/count_mean_gre_writing
    
    i += 1
    
print(i)

4.5
4.166666666666667
166.0
158.0
3.5
159.0
170.0
4.0
5.0
165.0
170.0
4.5
4.25
4.5
164.0
170.0
5.0
150.0
170.0
3.0
3.0
5.0
3.0
160.0
145.0
3.0
4.5
4.375
4.0
145.0
157.0
3.0
3.5
165.0
170.0
3.5
4.0
4.25
4.5
164.0
170.0
4.5
143.0
160.0
3.5
3.0
3.5
3.0
154.0
163.0
4.0
4.5
4.0
4.0
3.5
160.5
170.0
4.0
153.0
167.0
4.0
5.0
3.0
145.0
158.0
3.0
146.0
169.0
3.5
3.0
154.0
163.0
150.0
165.0
3.0
5.25
4.666666666666667
150.0
170.0
3.0
4.0
3.0
156.0
170.0
4.0
4.0
4.0
160.0
170.0
4.5
147.0
159.0
4.0
151.0
168.0
4.5
3.5
3.5
154.0
160.0
3.0
144.0
163.0
3.5
5.5
3.75
3.0
141.0
156.0
3.0
158.66666666666666
164.0
3.5
157.0
167.0
3.5
163.0
153.0
4.0
4.5
152.0
164.0
4.5
5.5
162.0
170.0
4.0
150.0
170.0
3.0
157.0
167.0
4.0
4.0
3.0
4.5
159.0
162.0
4.5
154.0
160.0
3.0
151.0
165.0
3.5
5.5
163.0
170.0
5.0
5.0
155.0
168.5
4.0
4.5
3.0
160.0
168.5
4.0
3.0
152.0
162.0
3.5
167.0
158.0
3.0
5.5
4.5
164.0
170.0
4.0
170.0
155.0
4.0
155.0
163.0
3.5
170.0
160.0
4.5
157.0
169.0
3.5
163.0
170.0
4.5
3.0
3.0
4.5
3.0
5.0
4.25
151.

In [52]:
uni_avg_file = outdir+"uni_cs_phd_avg_clean.p"

with open(uni_summary_file, 'wb') as fp:
    pickle.dump(uni_avg, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [81]:
my_gpa = 3.80
my_gre_verbal = 151
my_gre_quant = 162
my_gre_writting = 3

my_gpa_uni = []
my_gre_verbal_uni = []
my_gre_quant_uni = []
my_gre_writting_uni = []

for uni in uni_avg:
    if uni_avg[uni]["avg_mean_ugrad_gpa"]!=-1 and uni_avg[uni]["avg_mean_ugrad_gpa"]<=my_gpa:
        my_gpa_uni.append(uni)
#         print(uni)
    if uni_avg[uni]["avg_mean_gre_verbal"]!=-1 and uni_avg[uni]["avg_mean_gre_verbal"]<=my_gre_verbal:
        my_gre_verbal_uni.append(uni)
#         print(uni)
    if uni_avg[uni]["avg_mean_gre_quant"]!=-1 and uni_avg[uni]["avg_mean_gre_quant"]<=my_gre_quant:
        my_gre_quant_uni.append(uni)
#         print(uni)
    if uni_avg[uni]["avg_mean_gre_writing"]!=-1 and uni_avg[uni]["avg_mean_gre_writing"]<=my_gre_writting:
        my_gre_writting_uni.append(uni)
#         print(uni)

u = set.intersection(set(my_gpa_uni), set(my_gre_verbal_uni), set(my_gre_quant_uni), set(my_gre_writting_uni))

for i in u:
    print(i)

Rutgers University
