In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from pandas.core import datetools
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from scipy import stats
from sklearn.model_selection import train_test_split as tt_split
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
import csv

  from pandas.core import datetools


In [2]:
def read_dataset(path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or file-like
        Passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table parsed with ``pd.read_csv`` defaults.
    """
    return pd.read_csv(path)

In [3]:
# Module-level lookup tables mapping a district name to its integer id:
# one per chamber, plus a default table used when no chamber is given.
senate_name_dict = {}
house_name_dict = {}
name_dict = {}


def enumerate_districts(df, type=None):
    """Register every district name in ``df['District']`` with an integer id.

    Names are numbered in order of first appearance. Names that are already
    registered keep their id, and new names always receive the next unused
    id, so calling the function several times never produces duplicate ids.

    Parameters
    ----------
    df : mapping or pandas.DataFrame
        Anything where ``df['District']`` yields an iterable of names.
    type : {"Senate", "House"} or None
        Selects the table to fill: ``senate_name_dict``, ``house_name_dict``
        or, for any other value, ``name_dict``.

    Returns
    -------
    None
        The selected module-level dict is updated in place.
    """
    if type == "Senate":
        mapping = senate_name_dict
    elif type == "House":
        mapping = house_name_dict
    else:
        mapping = name_dict

    for name in df['District']:
        if name not in mapping:
            # The table size is the next free id. A counter restarted at 0
            # on each call would hand out ids that are already in use.
            mapping[name] = len(mapping)

In [4]:
def replace_district(x, type=None):
    """Look up the integer id registered for district name ``x``.

    Parameters
    ----------
    x : hashable
        District name used as the dictionary key.
    type : {"Senate", "House"} or None
        Chooses ``senate_name_dict``, ``house_name_dict`` or, for any other
        value, ``name_dict``.

    Returns
    -------
    The value stored for ``x`` in the chosen table.

    Raises
    ------
    KeyError
        If ``x`` is not a key of the chosen table.
    """
    if type == "Senate":
        lookup = senate_name_dict
    elif type == "House":
        lookup = house_name_dict
    else:
        lookup = name_dict
    return lookup[x]

In [5]:
# this will be different than historical_model
def clean_data(df, type=None):
    """Convert a raw merged district/legislator table into a numeric frame.

    Steps, in order:

    1. drop the ``name`` and ``Unnamed: 0`` columns;
    2. encode ``sex`` (f=1, m=0) and ``party`` (Democratic=1, Republican=0);
    3. fill missing ``sex`` with the rounded column mean, and missing
       ``party`` / ``Amount`` with the column mean;
    4. replace each ``District`` name by its id via ``replace_district``;
    5. for every other column that is neither float nor int and contains a
       comma in at least one cell, strip the commas and cast to float;
    6. strip commas from ``vote_count``, cast to float and mean-fill it;
       mean-fill ``vote_percent``;
    7. add ``female_dem`` (1 where sex == 1 and party == 1, else 0);
    8. drop every column whose name contains ``'Margin'``;
    9. drop every ``'Percent'`` column whose mean is below 0.05, together
       with the column named by swapping ``Percent`` for ``Estimate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is not modified; a new frame is returned.
    type : {"Senate", "House"} or None
        Forwarded to ``replace_district`` to choose the id table.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.

    Raises
    ------
    KeyError
        If an expected column is missing, or a district was not registered
        with ``enumerate_districts`` beforehand.
    """
    def _strip_commas(value):
        # "1,234" -> "1234". The text form of a missing value is normalised
        # to 'NaN' before the float cast.
        return str(value).replace(",", "").replace('nan', 'NaN')

    df = df.drop(["name", "Unnamed: 0"], axis=1)

    # make all text binary
    # Assign the result back instead of using replace(..., inplace=True) on
    # a column selection, which is not guaranteed to write through to df.
    df['sex'] = df['sex'].replace({'f': 1, 'm': 0})
    df['party'] = df['party'].replace({'Democratic': 1, 'Republican': 0})

    # fill NaN's with mean from column
    df['sex'] = df['sex'].fillna(round(df['sex'].mean()))
    df['party'] = df['party'].fillna(df['party'].mean())
    df['Amount'] = df['Amount'].fillna(df['Amount'].mean())
    df['District'] = df["District"].apply(lambda x: replace_district(x, type))

    for col in df.columns:
        if col == "vote_count":
            continue  # always converted below
        if df[col].dtype != float and df[col].dtype != int:
            # Convert once if any cell holds a thousands separator, rather
            # than re-running the conversion for every matching cell.
            if df[col].map(lambda value: "," in str(value)).any():
                df[col] = df[col].apply(_strip_commas).astype(float)

    df['vote_count'] = df['vote_count'].apply(_strip_commas).astype(float)
    df['vote_count'] = df['vote_count'].fillna(df['vote_count'].mean())
    df['vote_percent'] = df['vote_percent'].fillna(df['vote_percent'].mean())

    # add indicator for female democrat
    # Vectorised replacement for the iterrows()/set_value loop;
    # DataFrame.set_value no longer exists in pandas >= 1.0.
    df['female_dem'] = ((df['sex'] == 1) & (df['party'] == 1)).astype(int)

    # remove "(percent) margin of error" columns
    df = df.iloc[:, [index for index, x in enumerate(df.columns) if 'Margin' not in x]]

    # remove columns with low percent contributions
    percent_cols = [col for col in df.columns if 'Percent' in col and df[col].mean() < 0.05]
    for col in percent_cols:
        df = df.drop([col, col.replace("Percent", "Estimate")], axis=1)

    return df

In [6]:
def independent_columns(A, tol=0):
    """Return the indices of columns whose QR pivot exceeds ``tol``.

    ``A`` is factorised with ``np.linalg.qr``. Column ``j`` is reported when
    ``abs(R[j, j]) > tol``.

    Parameters
    ----------
    A : array-like, 2-D
        Matrix to factorise.
    tol : float, default 0
        Threshold on the absolute diagonal of ``R``. (A default of 1e-05
        was tried earlier and left commented out in the original signature.)

    Returns
    -------
    numpy.ndarray
        1-D integer array of the selected column positions, ascending.
    """
    _, upper = np.linalg.qr(A)
    pivot_magnitudes = np.abs(np.diag(upper))
    return np.flatnonzero(pivot_magnitudes > tol)

In [18]:
def logistic_regression(X_train, y_train):
    """Select features with RFE, then score logistic regression by leave-one-out CV.

    The function first keeps 40 columns chosen by recursive feature
    elimination around a ``LogisticRegression``. It then runs leave-one-out
    cross-validation on those columns. In every fold it computes chi-squared
    p-values on the training part and accumulates, per feature, the p-values
    that fall below .05.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature table; both ``.values`` and ``.columns`` are used.
        ``sklearn.feature_selection.chi2`` rejects negative feature values.
    y_train : pandas.Series
        Class label for every row of ``X_train``.

    Returns
    -------
    sig_features_dict : dict
        RFE-selected column name -> sum of that column's p-values over the
        folds in which p < .05, divided by the number of folds.
    accuracy : float
        Leave-one-out accuracy.
    classifier : LogisticRegression
        The model fitted in the last fold.
    """
    X_array = X_train.values
    y_array = y_train.values

    # n_features_to_select must be passed by keyword: recent scikit-learn
    # releases reject it as a positional argument.
    selector = RFE(LR(), n_features_to_select=40).fit(X_array, y_array)
    support = selector.support_
    features = [col for col, keep in zip(X_train.columns, support) if keep]
    newX = X_array[:, support]

    sig_features_dict = {name: 0 for name in features}
    y_true = []
    y_pred_acc = []
    n_folds = 0

    for train_index, test_index in LeaveOneOut().split(newX):
        X_tr, X_te = newX[train_index], newX[test_index]
        y_tr, y_test = y_array[train_index], y_array[test_index]

        chi, pval = chi2(X_tr, y_tr)
        for name, p in zip(features, pval):
            if p < .05:
                sig_features_dict[name] += p

        classifier = LR().fit(X_tr, y_tr)
        y_true.append(y_test[0])
        # predict() returns a length-1 array for the single held-out row.
        y_pred_acc.append(classifier.predict(X_te)[0])
        n_folds += 1

    # Average over the folds actually run. The divisor used to be a
    # hard-coded 27, which matches neither dataset used in this notebook.
    for name in sig_features_dict:
        sig_features_dict[name] = sig_features_dict[name] / n_folds

    accuracy = accuracy_score(y_true, y_pred_acc)
    print("Test accuracy: {}".format(accuracy))
    return sig_features_dict, accuracy, classifier

In [8]:
def classify_glm(train_df):
    """Leave-one-out accuracy of a statsmodels GLM that predicts ``sex``.

    Columns of the feature matrix are first filtered with
    ``independent_columns``. For every leave-one-out split a ``sm.GLM`` is
    fitted on the training rows, its summary is printed, and the prediction
    for the held-out row is rounded to the nearest integer. No ``family`` is
    passed to ``sm.GLM``, so the statsmodels default is used.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Table containing the target column ``sex`` plus the features.

    Returns
    -------
    float
        Fraction of held-out rows whose rounded prediction equals the label.
    """
    loo = LeaveOneOut()
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)

    independent = independent_columns(X_train)
    X = X_train.iloc[:, independent]
    # Report the number of columns kept, not the width of the unfiltered
    # feature matrix.
    print("Rank is {}".format(X.shape[1]))

    X_array = X.values
    y_array = y_train.values
    y_true = []
    y_pred_acc = []

    for train_index, test_index in loo.split(X_array):
        X_tr, X_te = X_array[train_index], X_array[test_index]
        y_tr, y_test = y_array[train_index], y_array[test_index]

        classifier = sm.GLM(y_tr, X_tr)
        results = classifier.fit()
        print(results.summary())
        pred = round(results.predict(X_te)[0])
        y_true += [y_test[0]]
        y_pred_acc += [pred]
    return accuracy_score(y_true, y_pred_acc)

In [9]:
# Per-legislator Senate table; the next cells read its 'name' and
# 'length_of_service' columns.
# NOTE(review): absolute path into one user's Downloads folder — not portable.
path = '/Users/tianyizhao/Downloads/Senate_features.csv'
senate_data_1 = pd.read_csv(path)
# Senate training features (file name says districts 2009-2012).
path = 'historical_model/train_data/merged_senate_districts_2009_2012.csv'
senate_data = pd.read_csv(path)
# Senate labels; the 'vulnerability' column is used as the training target.
senate_label = pd.read_csv('vulnerability_senate.csv')

In [10]:
# Per-legislator House table; the next cells read its 'name' and
# 'length_of_service' columns.
# NOTE(review): absolute path into one user's Downloads folder — not portable.
path = '/Users/tianyizhao/Downloads/House_features.csv'
house_data_1 = pd.read_csv(path)
# House training features (file name says districts 2009-2010).
path = 'historical_model/train_data/merged_house_districts_2009_2010.csv'
house_data = pd.read_csv(path)
# House labels; the 'vulnerability' column is used as the training target.
house_label = pd.read_csv('vulnerability_house.csv')

In [11]:
Year = []
# Copy each legislator's length of service from senate_data_1 into the
# training frame, matching on the 'name' column (first match wins).
for IDX, legislator in senate_data['name'].items():
    sevice = np.array(senate_data_1.loc[senate_data_1['name'] == legislator, 'length_of_service'])
    if len(sevice) != 0:
        senate_data.at[IDX, 'length_of_service'] = sevice[0]

# Keep numeric predictors only, then impute: column mean for the three
# numeric columns, 1 for legislators without a service record.
senate_data = senate_data.drop(columns=['name', 'sex', 'party', 'District'])
for column in ('Amount', 'vote_count', 'vote_percent'):
    senate_data[column] = senate_data[column].fillna(senate_data[column].mean())
senate_data['length_of_service'] = senate_data['length_of_service'].fillna(1)


In [12]:
Year = []
# Copy each legislator's length of service from house_data_1 into the
# training frame, matching on the 'name' column (first match wins).
for IDX, legislator in house_data['name'].items():
    sevice = np.array(house_data_1.loc[house_data_1['name'] == legislator, 'length_of_service'])
    if len(sevice) != 0:
        house_data.at[IDX, 'length_of_service'] = sevice[0]

# Keep numeric predictors only, then impute: column mean for the three
# numeric columns, 1 for legislators without a service record.
house_data = house_data.drop(columns=['name', 'sex', 'party', 'District'])
for column in ('Amount', 'vote_count', 'vote_percent'):
    house_data[column] = house_data[column].fillna(house_data[column].mean())
house_data['length_of_service'] = house_data['length_of_service'].fillna(1)


In [29]:
print("Senate Logistic Regression: ")
senate_features, accuracy, a = logistic_regression(senate_data, senate_label['vulnerability'])
# One row per selected feature, with a single column named 'p-score'.
df = pd.DataFrame.from_dict(senate_features, orient='index').rename(columns={0: 'p-score'})
df.to_csv('Senate_Vul.csv')

Senate Logistic Regression: 
Test accuracy: 0.8666666666666667


In [31]:
print("House Logistic Regression: ")
house_features, accuracy, a = logistic_regression(house_data, house_label['vulnerability'])
# One row per selected feature, with a single column named 'p-score'.
df = pd.DataFrame.from_dict(house_features, orient='index').rename(columns={0: 'p-score'})
df.to_csv('House_Vul.csv')

House Logistic Regression: 
Test accuracy: 0.7701863354037267


In [57]:
# Per-legislator Senate table, loaded again for the test set (same file as
# senate_data_1).
# NOTE(review): absolute path into one user's Downloads folder — not portable.
path = '/Users/tianyizhao/Downloads/Senate_features.csv'
senate_data_1_test = pd.read_csv(path)
# Senate test features (file name says districts 2015-2016).
path = 'historical_model/test_data/merged_senate_districts_2015_2016.csv'
senate_data_test = pd.read_csv(path)
# Same label file as used for training.
senate_label_test = pd.read_csv('vulnerability_senate.csv')
# Keep only the first row for each district.
senate_data_test = senate_data_test.drop_duplicates(subset = 'District',keep = 'first')

In [58]:
# Attach the vulnerability label to each test row by matching on 'District'
# (first match wins; rows without a match get no value).
for IDX, district in senate_data_test['District'].items():
    label = np.array(senate_label_test.loc[senate_label_test['District'] == district, 'vulnerability'])
    if len(label) != 0:
        senate_data_test.at[IDX, 'label'] = label[0]

# Split the joined labels back out of the feature frame.
senate_label_test = senate_data_test['label']
senate_data_test = senate_data_test.drop(columns=['label'])
senate_data_test.shape

(40, 355)

In [59]:
Year = []
# Copy each legislator's length of service from senate_data_1_test into the
# test frame, matching on the 'name' column (first match wins).
for IDX, legislator in senate_data_test['name'].items():
    sevice = np.array(senate_data_1_test.loc[senate_data_1_test['name'] == legislator, 'length_of_service'])
    if len(sevice) != 0:
        senate_data_test.at[IDX, 'length_of_service'] = sevice[0]

# Keep numeric predictors only, then impute: column mean for the three
# numeric columns, 1 for legislators without a service record.
senate_data_test = senate_data_test.drop(columns=['name', 'sex', 'party', 'District'])
for column in ('Amount', 'vote_count', 'vote_percent'):
    senate_data_test[column] = senate_data_test[column].fillna(senate_data_test[column].mean())
senate_data_test['length_of_service'] = senate_data_test['length_of_service'].fillna(1)
senate_data_test.shape

(40, 352)

In [60]:
# Fit on the full Senate training set and score the test districts.
classifier = LR()
classifier.fit(senate_data.values, senate_label['vulnerability'].values)
# Predict on the raw array as well: the model was fitted on .values (no
# column names), and recent scikit-learn warns when a DataFrame is passed
# to a model fitted that way.
# NOTE(review): nothing here checks that train and test columns are in the
# same order — confirm.
pred_senate = classifier.predict_proba(senate_data_test.values)
pred_senate.shape

(40, 2)

In [61]:
# Reload the test file to recover the 'District' column, which was dropped
# from the feature frame before prediction.
path = 'historical_model/test_data/merged_senate_districts_2015_2016.csv'
senate_data_test = pd.read_csv(path).drop_duplicates(subset='District', keep='first')
senate_label_test = pd.read_csv('vulnerability_senate.csv')
# Pair every district with the probability in the second predict_proba column.
df = pd.DataFrame(data={'District': senate_data_test['District'], 'Prob': pred_senate[:, 1]})
df.to_csv('senate_pred.csv')

In [62]:
path = '/Users/tianyizhao/Downloads/House_features.csv'
house_data_1_test = pd.read_csv(path)
path = 'historical_model/test_data/merged_house_districts_2015_2016.csv'
house_data_test = pd.read_csv(path)
house_label_test = pd.read_csv('vulnerability_house.csv')
house_data_test = senate_data_test.drop_duplicates(subset = 'District',keep = 'first')
house_label_test = house_label_test.drop_duplicates(subset = 'District',keep = 'first')

In [63]:
import re
for IDX, row in house_data_test.iterrows():
    a = row['District']
    strinfo = re.compile('First')
    a = strinfo.sub('1st',a)
    strinfo = re.compile('Second')
    a = strinfo.sub('2nd',a)
    strinfo = re.compile('Third')
    a = strinfo.sub('3rd',a)
    strinfo = re.compile('Fourth')
    a = strinfo.sub('4th',a)
    strinfo = re.compile('Fifth')
    a = strinfo.sub('5th',a)
    tmp = house_label_test[house_label_test['District'] == a]
    label = tmp['vulnerability']
    label = np.array(label)
    if len(label) != 0:
        house_data_test.at[IDX, 'label'] = label[0]
house_label_test = house_data_test['label']
house_data_test = house_data_test.drop(columns = ['label'])
house_data_test

Unnamed: 0,District,Estimate; SEX AND AGE - Total population,Margin of Error; SEX AND AGE - Total population,Percent; SEX AND AGE - Total population,Estimate; SEX AND AGE - Total population - Male,Margin of Error; SEX AND AGE - Total population - Male,Percent; SEX AND AGE - Total population - Male,Percent Margin of Error; SEX AND AGE - Total population - Male,Estimate; SEX AND AGE - Total population - Female,Margin of Error; SEX AND AGE - Total population - Female,...,"Estimate; Total: - $150,000 to $199,999","Margin of Error; Total: - $150,000 to $199,999","Estimate; Total: - $200,000 or more","Margin of Error; Total: - $200,000 or more",Amount,name,sex,party,vote_count,vote_percent
0,"Berkshire, Hampshire, Franklin & Hampden",153420.5,357.0,153420.5,74276.0,313.5,48.4,0.15,79144.5,274.5,...,2900.5,286.0,2870.0,303.0,156446.03,,,,,
1,Bristol & Norfolk,160212.0,844.5,160212.0,78606.0,1057.5,49.1,0.6,81606.0,1020.5,...,6219.5,448.5,7701.5,421.0,235464.64,James E. Timilty,m,Democratic,66611.0,99.3
2,Cape & Islands,170626.0,27.0,170626.0,81311.0,640.0,47.65,0.4,89315.0,641.5,...,4348.5,429.5,4496.0,408.5,353871.73,Julian Andre Cyr,m,Democratic,59974.0,56.9
3,Fifth Middlesex,173154.0,359.5,173154.0,84040.5,1205.0,48.55,0.7,89113.5,1192.5,...,6929.0,497.5,6564.5,433.0,1072782.64,Jason M. Lewis,m,Democratic,52954.0,68.9
4,First Bristol & Plymouth,164643.5,49.5,164643.5,79809.5,988.5,48.5,0.6,84834.0,987.0,...,3289.0,333.5,2198.5,254.0,327750.98,Michael J. Rodrigues,m,Democratic,54618.0,99.4
5,First Essex,175997.0,621.0,175997.0,85585.0,1039.5,48.65,0.55,90412.0,1044.0,...,5797.5,460.5,4661.0,369.0,213896.46,Kathleen A. O'Connor Ives,f,Democratic,65131.0,99.1
6,First Essex & Middlesex,176056.5,621.5,176056.5,86994.5,1042.0,49.4,0.55,89062.0,1055.5,...,7870.0,523.0,9916.0,586.0,767151.46,Bruce E. Tarr,m,Republican,77851.0,99.2
7,First Hampden & Hampshire,163175.5,1854.5,163175.5,78838.5,1333.0,48.35,0.55,84337.0,1265.0,...,4409.5,435.0,3819.5,387.0,612047.27,Eric P. Lesser,m,Democratic,44602.0,55.8
8,First Middlesex,171713.5,83.5,171713.5,84975.5,1055.5,49.5,0.65,86738.0,1058.5,...,5121.5,434.0,4554.5,378.5,211728.92,Eileen M. Donoghue,f,Democratic,56759.0,98.6
9,First Middlesex & Norfolk,164792.5,442.5,164792.5,76256.5,1083.0,46.25,0.65,88536.0,1093.0,...,7186.0,512.5,16984.0,630.5,2492655.2,Cynthia Stone Creem,f,Democratic,62668.0,99.0


In [64]:
Year = []
for IDX, row in house_data_test.iterrows():
    a = row['name']
    
    tmp = house_data_1_test[house_data_1_test['name'] == a]
    
    sevice = tmp['length_of_service']
    sevice = np.array(sevice)
    if len(sevice) != 0:
        house_data_test.at[IDX, 'length_of_service'] = sevice[0]
house_data_test = house_data_test.drop(columns = ['name','sex','party','District'])
house_data_test['Amount'] = house_data_test['Amount'].fillna(house_data_test['Amount'].mean())
house_data_test['vote_count'] = house_data_test['vote_count'].fillna(house_data_test['vote_count'].mean())
house_data_test['vote_percent'] = house_data_test['vote_percent'].fillna(house_data_test['vote_percent'].mean())
house_data_test['length_of_service'] = house_data_test['length_of_service'].fillna(1)



In [65]:
classifier = LR()
classifier.fit(house_data.values, house_label['vulnerability'].values)
pred_house = classifier.predict_proba(house_data_test)
pred_house.shape

(40, 2)

In [66]:
path = 'historical_model/test_data/merged_house_districts_2015_2016.csv'
house_data_test = pd.read_csv(path)
house_label_test = pd.read_csv('vulnerability_house.csv')
house_data_test = senate_data_test.drop_duplicates(subset = 'District',keep = 'first')
d = {'District': house_data_test['District'], 'Prob': pred_house[:,1]}
df = pd.DataFrame(data = d)
df.to_csv('house_pred.csv')

In [67]:
# The previous cell reloads house_label_test from 'vulnerability_house.csv',
# so this prints the raw label table, not the per-row labels joined earlier.
print(house_label_test)

                          District  vulnerability
0                     10th Bristol              0
1                       10th Essex              1
2                     10th Hampden              1
3                   10th Middlesex              1
4                     10th Norfolk              0
5                    10th Plymouth              1
6                     10th Suffolk              0
7                   10th Worcester              1
8                     11th Bristol              0
9                       11th Essex              1
10                    11th Hampden              1
11                  11th Middlesex              0
12                    11th Norfolk              0
13                   11th Plymouth              0
14                    11th Suffolk              0
15                  11th Worcester              1
16                    12th Bristol              1
17                      12th Essex              1
18                    12th Hampden              0


In [25]:
# Alternative pipeline on the VoteBuilder exports: register district ids,
# clean each table, then run the logistic-regression evaluation.
# The recorded run of this cell stopped with FileNotFoundError on the Senate
# CSV (see the output below).
# NOTE(review): this cell rebinds senate_data and house_data, which other
# cells in this notebook use as the training feature frames.
senate_data = read_dataset("VoteBuilder_and_all_data_Senate.csv")
enumerate_districts(senate_data, "Senate")
    
cleaned_senate = clean_data(senate_data,"Senate")

house_data = read_dataset("VoteBuilder_and_all_data_House.csv")
enumerate_districts(house_data, "House")
    
cleaned_house = clean_data(house_data,"House")


# NOTE(review): logistic_regression as defined in this notebook takes
# (X_train, y_train) and returns three values. The two calls below pass a
# single argument and unpack two values, so they would fail even with the
# CSV files present. The intended target column is not visible here — confirm
# before repairing the calls.
print("Senate Logistic Regression: ")
senate_features, accuracy = logistic_regression(cleaned_senate)
print(senate_features)
# print("OLS Generalized Linear Model: ")
# print(classify_glm(cleaned_senate))

print("House Logistic Regression: ")
house_features, house_accuracy = logistic_regression(cleaned_house)

FileNotFoundError: File b'VoteBuilder_and_all_data_Senate.csv' does not exist

In [None]:
# print("OLS Generalized Linear Model: ")
# print(classify_glm(cleaned_senate))

In [None]:
# Write one "feature,value" row per entry of senate_features.
# newline='' hands line-ending control to the csv module; without it every
# row is followed by a blank line on Windows.
with open('senate_features.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(senate_features.items())

In [None]:
# Write one "feature,value" row per entry of house_features.
# newline='' hands line-ending control to the csv module; without it every
# row is followed by a blank line on Windows.
with open('house_features.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(house_features.items())