In [36]:
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the cleaned aggression dataset from a path relative to the working directory.
aggression = pd.read_csv('../data/aggression_clean_data.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
aggression.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,aggression,aggression_score,label
0,37675,This is not creative Those are the dictionary...,2002,True,article,random,train,0.1,0.0,0
1,44816,the term standard model is itself less NPOV t...,2002,True,article,random,train,0.0,0.111111,0
2,49851,True or false the situation as of March 2002 w...,2002,True,article,random,train,0.0,0.1,0
3,89320,Next maybe you could work on being less conde...,2002,True,article,random,dev,0.444444,-0.444444,0
4,93890,This page will need disambiguation,2002,True,article,random,train,0.0,0.333333,0


In [4]:
# Class balance as fractions of all rows (the output below shows roughly 85% label 0 vs 15% label 1).
aggression['label'].value_counts(normalize=True)

0    0.852805
1    0.147195
Name: label, dtype: float64

In [7]:
def split_data(data, pct_positive, test_size=None, train_size=None,
               random_state=None):
    """
    Split `data` into a class-rebalanced training set and a plain test set.

    A custom train_test_split implementation that creates a training dataset
    with a specific proportion of positive classes. This is meant to
    combat class imbalance in training data ONLY. The testing data is a
    simple random sample of `data`, so it retains (approximately) the
    original class proportions.

    Args:
        data -- dataframe with a 'comment' column (features) and a 'label'
            column (1 = positive class, 0 = negative class). Rows are
            removed from the training pool by index label, so the index is
            assumed to be unique. (Originally documented as the dataframe
            acquired from merge_with_target().)
        pct_positive -- desired proportion of positive classes in training data.
            (float, 0 < pct_positive <= 1)
        test_size -- number of samples to include in the testing dataset.
            (integer, default None --> 30% of the full dataset)
        train_size -- number of samples to include in the training dataset.
            (integer, default None --> maximum training size for given
            parameters)
        random_state -- seed passed to every sampling step (test draw,
            positive draw, negative draw and final shuffle).
            (default None --> a different split on every call)

    Returns:
        Tuple (X_train, X_test, y_train, y_test) where the X_* values are the
        'comment' column and the y_* values are the 'label' column of the
        respective subset.

    Raises:
        ValueError -- if pct_positive is not in the interval (0, 1].
    """
    # Guard against division by zero and negative sample counts further down.
    if not 0 < pct_positive <= 1:
        raise ValueError(
            f'pct_positive must be in the interval (0, 1], got {pct_positive}')

    # get test data
    if test_size:
        num_test_samples = int(test_size)
    else:
        num_test_samples = int(0.3 * data.shape[0])

    df_test = data.sample(num_test_samples, random_state=random_state)
    X_test = df_test['comment']
    y_test = df_test['label']

    # get train data
    # everything that was not drawn for testing, separated by class
    df_train = data.drop(df_test.index)
    df_train_pos = df_train[df_train['label'] == 1]
    df_train_neg = df_train[df_train['label'] == 0]

    # some input validation for train_size
    # The number of available positives caps how large the training set can
    # be while still honouring pct_positive.
    max_train_size = int(df_train_pos.shape[0] / pct_positive)
    if not train_size:
        train_size = max_train_size
    elif train_size > max_train_size:
        warnings.warn(f'train_size of {train_size} exceeds the amount of '
                      'training data available for the given parameters. '
                      f'Resetting train size to {max_train_size}')
        train_size = max_train_size
    else:
        train_size = int(train_size)

    # assemble train dataframe
    num_pos_samples = int(pct_positive * train_size)
    num_neg_samples = train_size - num_pos_samples

    # Both class draws and the final shuffle are seeded so that a given
    # random_state always yields the same training set.
    df_train = pd.concat([
        df_train_pos.sample(num_pos_samples, random_state=random_state),
        df_train_neg.sample(num_neg_samples, random_state=random_state),
    ]).sample(frac=1, random_state=random_state)

    X_train = df_train['comment']
    y_train = df_train['label']

    return (X_train, X_test, y_train, y_test)

In [9]:
# Create a custom train/test split with the split_data function Christian
# created: the training set gets equal classes (50% positive).
# A fixed random_state is passed so split_data's seeded sampling steps are
# repeatable instead of changing on every run.

X_train, X_test, y_train, y_test = split_data(
    data=aggression, pct_positive=0.5, random_state=42)

In [None]:
# Empty results table: one row per fitted model, holding the winning
# CountVectorizer parameters plus train/test scores.

column_names = [
    'model',
    'cv__max_df',
    'cv__max_features',
    'cv__min_df',
    'training_score',
    'test_score',
]

model_df = pd.DataFrame(columns=column_names)

# Create function to add results to model df

def add_to_model(df, model, name):
    """
    Return `df` extended by one row summarising a fitted parameter search.

    Args:
        df -- results dataframe to extend; it is not modified in place.
        model -- fitted search object exposing `best_params_` (with the keys
            'cv__max_df', 'cv__max_features' and 'cv__min_df') and a
            `score(X, y)` method.
        name -- label stored in the 'model' column of the new row.

    Returns:
        A new dataframe holding the rows of `df` followed by the new row,
        re-indexed from 0.

    Note:
        Scores are computed on the notebook-level variables X_train, y_train,
        X_test and y_test, so the train/test split cell must be run first.
    """
    best = model.best_params_
    new_row = pd.DataFrame([{
        'model': name,
        'cv__max_df': best['cv__max_df'],
        'cv__max_features': best['cv__max_features'],
        'cv__min_df': best['cv__min_df'],
        'training_score': model.score(X_train, y_train),
        'test_score': model.score(X_test, y_test),
    }])

    if len(df) == 0:
        # Nothing to stack yet: return the row in df's column order rather
        # than concatenating with an empty frame (newer pandas versions warn
        # about dtype inference in that case).
        extra_cols = [c for c in new_row.columns if c not in df.columns]
        return new_row.reindex(columns=list(df.columns) + extra_cols)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([df, new_row], ignore_index=True)

In [82]:
# Two-step text classifier: count-vectorize the comments (English stop words
# removed), then fit a logistic regression on the counts.

pipe = Pipeline(steps=[
    ('cv', CountVectorizer(stop_words='english')),
    ('lr', LogisticRegression(max_iter=1000)),
])

In [83]:
# Hyperparameter grid for the 'cv' (CountVectorizer) step of the pipeline.

pipe_params = dict(
    cv__max_features=[5000, 8000, 10000, 12000],
    cv__min_df=[2, 3],
    cv__max_df=[0.9, 0.95],
)

In [84]:
# Grid search over pipe_params for the pipeline, using 5-fold cross-validation.

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)

In [85]:
# Fit the grid search on the training split.
# The grid in pipe_params has 4 * 2 * 2 = 16 candidates, each evaluated with
# cv=5 folds, i.e. 80 cross-validation fits.

gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(stop_words='english')),
                                       ('lr',
                                        LogisticRegression(max_iter=1000))]),
             param_grid={'cv__max_df': [0.9, 0.95],
                         'cv__max_features': [5000, 8000, 10000, 12000],
                         'cv__min_df': [2, 3]})

In [86]:
# Record this search's best parameters and scores in the results table.
# NOTE: this reassigns model_df from itself, so re-running the cell appends
# another row for the same model.
model_df = add_to_model(model_df, gs, 'logistic regression - stop words')

In [87]:
# Display the accumulated results table.
model_df

Unnamed: 0,model,cv__max_df,cv__max_features,cv__min_df,training_score,test_score
0,logistic regression,0.9,8000,2,0.941937,0.867636
1,logistic regression,0.9,10000,2,0.948069,0.870205
2,logistic regression,0.9,12000,2,0.952824,0.870523
3,logistic regression - stop words,0.9,12000,3,0.945691,0.866049


In [88]:
# Save the results table to CSV; index=False leaves the row index out of the file.
model_df.to_csv('../data/lr_model_results.csv', index = False)