In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split

from scipy import stats
from sklearn.linear_model import LinearRegression

# python display
from IPython.display import HTML, display_html, display

#Feature Selection
from sklearn.feature_selection import SelectKBest, f_regression, RFE

from prepare import *

In [20]:
# Load the prepared dataframe plus two lookup objects from prepare.wrangle_data().
# NOTE(review): revert_key / replace_key look like value-mapping tables for the
# encoded answers, but they are not used in the cells shown here -- confirm in prepare.py.
df, revert_key, replace_key  = wrangle_data()

In [21]:
# Preview the first five rows of the wrangled dataframe
df.head()

Unnamed: 0,qkey,device_type,language,form,attitude,avg_family,happen_general,happen_health,happen_race,happen_usa,...,family_income_three_categories,registered_to_vote,political_views,internet_access,volunteer,weight,is_pes,pes_val,is_very_pes,is_very_opt
0,100363,Mobile phone,English,Form 1,Somewhat optimistic,Get worse,The U.S. economy will be STRONGER,Health care will be MORE affordable,Race relations will IMPROVE,The U.S. will be MORE important in the world,...,"$75,000+",You are ABSOLUTELY CERTAIN that you are regist...,Liberal,Internet User,No,0.599006,0,1,0,0
1,101224,Mobile phone,English,Form 1,Somewhat optimistic,Get better,The U.S. economy will be STRONGER,Health care will be LESS affordable,Race relations will IMPROVE,The U.S. will be MORE important in the world,...,"$30-$74,999",You are ABSOLUTELY CERTAIN that you are regist...,Conservative,Internet User,Yes,0.292981,0,1,0,0
2,101437,Desktop,English,Form 1,Somewhat pessimistic,Get worse,The U.S. economy will be WEAKER,Health care will be LESS affordable,Race relations will GET WORSE,The U.S. will be LESS important in the world,...,"<$30,000",You are ABSOLUTELY CERTAIN that you are regist...,Conservative,Internet User,No,0.418871,1,2,0,0
3,102130,Mobile phone,English,Form 1,Somewhat optimistic,Stay about the same,The U.S. economy will be WEAKER,Health care will be LESS affordable,Race relations will IMPROVE,The U.S. will be LESS important in the world,...,Don't know/Refused,You are ABSOLUTELY CERTAIN that you are regist...,Moderate,Non Internet User,Yes,0.342058,0,1,0,0
4,103094,Mobile phone,English,Form 1,Somewhat optimistic,Stay about the same,Refused,Health care will be LESS affordable,Refused,The U.S. will be LESS important in the world,...,"<$30,000",You are NOT registered to vote at your current...,Liberal,Internet User,Yes,0.329465,0,1,0,0


In [67]:
class Attitudes_explore():
    '''
    Explore how the categorical features of a dataframe relate to a target column.

    On construction the dataframe is split into train / validate / test sets twice:
    once as given (``X_train``, ``y_train``, ...) and once after one-hot encoding
    with ``pd.get_dummies`` (``X_train_dummies``, ``y_train_dummies``, ...).
    ``run_statistical_tests`` fills ``chi2_df`` with one chi-squared test of
    independence per feature, and the plotting methods draw the features with
    the lowest p-values.
    '''

    def __init__(self, df: pd.DataFrame, target: str, subset_title: str):
        '''
        Split ``df`` and store the resulting pieces as attributes.

        Parameters
        ----------
        df : pd.DataFrame
            Features plus the target column.
        target : str
            Name of the target column in ``df``.
        subset_title : str
            Heading shown above the plots; also used in saved image file names.

        Raises
        ------
        KeyError
            If ``target`` is not a column of ``df``.
        '''
        self.target = target
        self.subset_title = subset_title

        # Every column except the target and the bookkeeping columns is treated
        # as a categorical feature. 'qkey' and 'weight' are dropped only when
        # present, instead of hiding a failed drop behind a bare ``except``.
        non_features = [col for col in ('qkey', 'weight') if col in df.columns]
        self.categorical_features = df.drop(columns=non_features + [target]).columns.to_list()

        # Split the dataframe as given
        train, validate, test = train_validate_test_split(df, target)

        self.X_train = train.drop(columns=[target])
        self.y_train = train[target]

        self.X_validate = validate.drop(columns=[target])
        self.y_validate = validate[target]

        self.X_test = test.drop(columns=[target])
        self.y_test = test[target]

        # One-hot encode the non-numeric columns, then split again.
        # NOTE(review): only the target is removed below, so 'qkey' and 'weight'
        # (when present in df) stay in the X_*_dummies matrices -- confirm that
        # an id column and a survey weight are meant to be model features.
        # NOTE(review): the two splits hold the same rows only if
        # train_validate_test_split is deterministic -- confirm in prepare.py.
        df_dummies = pd.get_dummies(df, drop_first = True)
        train_dummies, validate_dummies, test_dummies = train_validate_test_split(df_dummies, target)

        self.X_train_dummies = train_dummies.drop(columns=[target])
        self.y_train_dummies = train_dummies[target]

        self.X_validate_dummies = validate_dummies.drop(columns=[target])
        self.y_validate_dummies = validate_dummies[target]

        self.X_test_dummies = test_dummies.drop(columns=[target])
        self.y_test_dummies = test_dummies[target]

    def run_statistical_tests(self):
        '''
        Run a chi-squared test of independence between each categorical feature
        and the target on the training split.

        The results are stored in ``self.chi2_df`` (one row per feature, columns
        ``chi2``, ``p_val``, ``deg_free``, ``expected_freq``), sorted by ascending
        p-value so that ``head(n)`` yields the n most significant features.
        '''
        result_columns = ['chi2', 'p_val', 'deg_free', 'expected_freq']
        records = []

        for col in self.categorical_features:
            # Observed counts of every (feature value, target value) pair
            contingency_table = pd.crosstab(self.X_train[col], self.y_train)
            chi2, p, deg_free, expect_freq = stats.chi2_contingency(contingency_table)
            records.append(dict(zip(result_columns, (chi2, p, deg_free, expect_freq))))

        # Build the frame once rather than enlarging it row by row, then rank by
        # p-value (mergesort is stable, so ties keep their feature order).
        self.chi2_df = pd.DataFrame(
            records, index=list(self.categorical_features), columns=result_columns
        ).sort_values('p_val', kind='mergesort')

    def _top_features(self, n):
        '''Return the names of the ``n`` features with the lowest chi-squared p-values.'''
        # Run the tests on demand so the plot methods also work on a fresh instance
        if not hasattr(self, 'chi2_df'):
            self.run_statistical_tests()
        return self.chi2_df.head(n).index.to_list()

    def plot_bar_graphs(self, n=5, saved=False):
        '''
        Draw one bar plot of the target against each of the ``n`` features with
        the lowest chi-squared p-values.

        Parameters
        ----------
        n : int
            Number of features to plot.
        saved : bool
            When True, each figure is also written to the ``images`` folder.
        '''
        display(HTML(f'''<html><h1>{self.subset_title}</h1></html>'''))

        for col in self._top_features(n):
            plt.figure(figsize=(10, 5))
            sns.barplot(x= self.y_train, y = self.X_train[col]).set_title(
                label=f'{self.target.title()} vs {col.title()} Barplot')
            plt.xticks(rotation = 90, horizontalalignment='right', fontsize = 12)
            if saved:
                # Save before show(): show() may leave an empty current figure behind
                plt.savefig(f"images/{self.subset_title.replace(' ', '_').lower()}{col.lower()}_bar_plot.png")
            plt.show()

    def countplots(self, n=5, saved=False):
        '''
        Draw a Seaborn countplot, split by the target, for each of the ``n``
        features with the lowest chi-squared p-values.

        Parameters
        ----------
        n : int
            Number of features to plot.
        saved : bool
            When True, each figure is also written to the ``images`` folder so it
            can be read into the README.
        '''
        # display() is required: a bare HTML(...) inside a method renders nothing
        display(HTML(f'''<html><h1><center>{self.subset_title}</center></h1></html>'''))

        for col in self._top_features(n):
            plt.figure(figsize=(10, 5))
            sns.countplot(x=self.X_train[col], hue=self.y_train).set_title(
                label=f'{col.title()} Counts by {self.target.title()}')
            plt.xticks(rotation = 90, horizontalalignment='right', fontsize = 12)
            if saved:
                plt.savefig(f"images/{self.subset_title.replace(' ', '_').lower()}{col.lower()}_count_plot.png")
            plt.show()
    

In [68]:
def generate_and_return_obj(df: pd.DataFrame):
    '''
    Build an Attitudes_explore instance for the 'is_pes' target and run its
    statistical tests before handing it back.

    Parameters
    ----------
    df : pd.DataFrame
        Source dataframe; the columns listed below are left out when present.

    Returns
    -------
    Attitudes_explore
        The instance, with ``run_statistical_tests`` already called.
    '''
    # Columns that are not being targeted yet, so they are kept out of this run
    held_back = {'pes_val', 'is_very_pes', 'is_very_opt', 'attitude', 'avg_family'}
    kept_columns = [name for name in df.columns if name not in held_back]

    explorer = Attitudes_explore(
        df[kept_columns],
        target='is_pes',
        subset_title = 'Whole dataset with top 5 lowest p-vals',
    )

    # Populate the results before returning so callers can use them directly
    explorer.run_statistical_tests()
    return explorer
    

In [69]:
# Build the explorer for the whole dataframe; generate_and_return_obj also
# runs the statistical tests before returning it
o = generate_and_return_obj(df)

In [71]:
# Inspect the one-hot encoded training features; the output below shows that
# 'qkey' and 'weight' are still present alongside the dummy columns
o.X_train_dummies

Unnamed: 0,qkey,weight,device_type_Mobile phone,device_type_Tablet,language_Spanish,form_Form 2,happen_general_The U.S. economy will be STRONGER,happen_general_The U.S. economy will be WEAKER,happen_health_Health care will be MORE affordable,happen_health_Refused,...,registered_to_vote_You are NOT registered to vote at your current address,"registered_to_vote_You are PROBABLY registered, but there is a chance your registration has lapsed",political_views_Liberal,political_views_Moderate,political_views_Refused,political_views_Very conservative,political_views_Very liberal,internet_access_Non Internet User,volunteer_Refused,volunteer_Yes
1859,201801113305,3.057078,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
96,191336,0.433484,0,0,0,1,0,1,0,1,...,0,1,1,0,0,0,0,0,0,0
420,668019,2.413983,0,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1424,201801054651,0.362586,1,0,0,1,0,1,1,0,...,0,0,1,0,0,0,0,0,0,0
114,200943,0.303046,0,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678,201501605240,2.336849,0,1,0,1,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
957,201701615366,0.553714,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
239,333639,1.368277,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1795,201801105856,1.338218,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Now we will begin building our functions for modeling

In [72]:
def test_a_model(X_train, y_train, X_validate, y_validate, model, model_name, score_df):
    '''
    Function takes in X and y train
    X and y validate (or test) 
    A model with it's hyper parameters
    And a df to store the scores 
    - Set up an empty dataframe with score_df first
    - score_df = pd.DataFrame(columns = ['model_name', 'train_score', 'validate_score'])
    '''
    this_model = model

    this_model.fit(X_train, y_train)

    # Check with Validate

    train_score = this_model.score(X_train, y_train)
    
    validate_score = this_model.score(X_validate, y_validate)
    
    model_dict = {'model_name': model_name, 
                  'train_score': train_score, 
                  'validate_score':validate_score}
    score_df = score_df.append(model_dict, ignore_index = True)
    
    return score_df

########### Evaluation metrics printing function

def print_metrics(model, X, y, pred, class_names, set_name = 'This Set'):
    '''
    Print a classification report and show the confusion matrix as a heatmap.

    Parameters
    ----------
    model : fitted estimator; printed as the header of the report.
    X : features of the set being evaluated. Kept for backward compatibility;
        the confusion matrix is built from ``pred`` so that it always matches
        the classification report printed above it.
    y : true labels.
    pred : predicted labels for ``X``.
    class_names : display labels for the confusion-matrix axes, in sorted label
        order (for example ['not pessimistic', 'pessimistic'] for a 0/1 target).
    set_name : name of the evaluated set (train, validate or test).

    To customize the colors, change the palette inside the function.
    '''
    # Imported here so the function does not depend on a later notebook cell.
    # plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
    # replaces it.
    from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

    print(model)
    print(f"~~~~~~~~{set_name} Scores~~~~~~~~~")
    print(classification_report(y, pred))

    purple_cmap = sns.color_palette("light:indigo", as_cmap=True)

    with sns.axes_style("white"):
        ConfusionMatrixDisplay(
            confusion_matrix(y, pred), display_labels=class_names
        ).plot(cmap = purple_cmap)
        plt.grid(False)
        plt.show()
        print()


######### This function makes models and prints metrics (uses above function)
#### can run in a loop to loop through models 

def make_models_and_print_metrics(model, model_name, X_train, y_train, X_validate, y_validate, class_names):
    '''
    Fit a model on the training data and report its metrics on the train and
    validate sets through print_metrics.

    Parameters
    ----------
    model : estimator object to fit.
    model_name : name printed in the banner (for display only).
    X_train, y_train : training features and target.
    X_validate, y_validate : validation features and target.
    class_names : category names passed on to print_metrics.
    '''
    model.fit(X_train, y_train)

    # Predict for both sets before any output, then report train followed by validate
    evaluations = [
        ('Train', X_train, y_train, model.predict(X_train)),
        ('Validate', X_validate, y_validate, model.predict(X_validate)),
    ]

    print(f'                   ============== {model_name} ================           ')
    for label, features, actual, predicted in evaluations:
        print_metrics(model, features, actual, predicted, class_names, set_name=label)
    print('-------------------------------------------------------------------\n')


######### Function for evaluating the final Test data ################ 

def make_models_and_print_metrics_test_data(model, model_name, X_train, y_train, X_test, y_test, class_names):
    '''
    Fit a model on the training data and report its metrics on the final test
    set through print_metrics.

    Parameters
    ----------
    model : estimator object to fit.
    model_name : name printed in the banner (for display only).
    X_train, y_train : training features and target used for fitting.
    X_test, y_test : held-out test features and target.
    class_names : category names passed on to print_metrics.
    '''
    model.fit(X_train, y_train)
    held_out_pred = model.predict(X_test)

    banner = f'                   ============== {model_name} ================           '
    print(banner)

    # Only the test set is reported here
    print_metrics(model, X_test, y_test, held_out_pred, class_names, set_name='Test')
    print('------------------------------------')

## DataFrame for Storing the Results of the Models

In [79]:
# Create an empty dataframe to store the scores; test_a_model returns a new
# frame with one more row per model, so the result must be assigned back
score_df = pd.DataFrame(columns = ['model_name', 'train_score', 'validate_score'])

## Get Baseline Accuracy

In [74]:
# Baseline: always predict the most frequent class of the training target.
# Taking the largest class share works whichever class is the majority.
baseline_accuracy = o.y_train_dummies.value_counts(normalize=True).max()
print(f'Baseline Accuracy: {baseline_accuracy:.2%}')

Baseline Accuracy: 55.75%


## Import Models and Get Model List

In [80]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression

In [81]:
# Seed shared by the estimators that use randomness, so the scores below can be
# reproduced after Restart & Run All
RANDOM_STATE = 123

# create list of models to loop through
# NOTE(review): in the recorded output SVC and logistic regression score exactly
# the baseline accuracy -- possibly caused by the unscaled 'qkey' / 'weight'
# columns in the feature matrix; confirm.
model_list = [
    MultinomialNB(),
    LinearSVC(random_state=RANDOM_STATE),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    KNeighborsClassifier(),
    LogisticRegression(),
]

# create list of model names that correspond to models (same order as model_list)
# NOTE(review): the 'tfidf' in the KNN label looks like a leftover; no tf-idf
# features are built in this notebook. Label kept unchanged so existing
# results stay comparable.
model_names = ['Naive_Bayes_all_features', 'SVC_all_features', 'Decision_tree_all_features', 
               'Random_forest_all_features', 'KNN_tfidf_all_features', 'Log_reg_all_features']


In [82]:
# zip() would silently drop unmatched entries, so check the pairing first
assert len(model_list) == len(model_names), 'model_list and model_names must line up'

# Start from an empty frame so that re-running this cell does not duplicate rows
score_df = pd.DataFrame(columns = ['model_name', 'train_score', 'validate_score'])

# Append one row of train / validate scores per model
for model, name in zip(model_list, model_names):
    score_df = test_a_model(o.X_train_dummies, o.y_train_dummies, o.X_validate_dummies, o.y_validate_dummies, model, name, score_df)



In [83]:
# Show the train / validate scores collected for every model
score_df

Unnamed: 0,model_name,train_score,validate_score
0,Naive_Bayes_all_features,0.769176,0.763245
1,SVC_all_features,0.557528,0.557947
2,Decision_tree_all_features,1.0,0.683775
3,Random_forest_all_features,1.0,0.778146
4,KNN_tfidf_all_features,0.629972,0.509934
5,Log_reg_all_features,0.557528,0.557947
