# 4526 Midterm Solution:
Arun Agarwal

## Imports:

In [50]:
#Basic Imports:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import math
from collections import Counter

# Plotting Imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import tree
from sklearn.datasets import make_blobs
from sklearn import datasets

# Preprocessing Imports
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.preprocessing as preproc
from sklearn.feature_extraction import text
import pickle
import warnings
warnings.filterwarnings("ignore")
from random import shuffle
import random
random.seed(123)

# Model Imports:
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection

# Metrics Import
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report 
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics.cluster import normalized_mutual_info_score

## Functions:

In [7]:
def get_baseline_results_for_single_classifier(data, target_col, clf, filename):
    X, y = data.drop(target_col, axis=1), data[target_col]
    X_tr, X_te, y_tr, y_te = train_test_split(X,y,test_size=0.2)
    clf = clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    report = classification_report(y_te, y_pred, output_dict=True)
    pd.DataFrame(report).transpose().to_csv(f'../Data/baseline_results/{filename}.csv')
    return report

In [8]:
def decompose_error(model, data, target_name, loss='mse'):
    X, y = data.drop(target_name, axis=1), data[target_name]
    X_tr, X_te, y_tr, y_te = train_test_split(X,y,test_size=0.2)
    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model, X_tr.values, y_tr.values, X_te.values, y_te.values, 
        loss=loss,
        random_seed=42)
    print("Average Expected Loss: ", avg_expected_loss)
    print("Average Bias: ", avg_bias)
    print("Average Variance: ", avg_var)
    
    return avg_expected_loss, avg_bias, avg_var

In [41]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [42]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Tokenize each word
    text =  nltk.WordPunctTokenizer().tokenize(text)
        
    return text

In [48]:
def counter_cosine_similarity(c1, c2):
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    return dotprod / (magA * magB)

In [119]:
def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0 ):
    model = LogisticRegression(C=_C).fit(X_tr, y_tr)
    y_pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    print('Test Score with', description, 'features', score)
    report = classification_report(y_test, y_pred,output_dict=True)
    pd.DataFrame(report).transpose().to_csv(f'LogisticRegression{description}.csv')
    return model

## Data Preprocessing:

In [123]:
train_df = pd.read_csv("train_with_label.txt", delimiter = "r'\t", header = None, engine = 'python')
train_df = train_df[0].str.split("\t", expand=True)
train_df = train_df.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
train_df["classification"] = pd.to_numeric(train_df["classification"])
train_df.drop_duplicates(inplace = True)

lemm = nltk.stem.WordNetLemmatizer()

train_df['Text_Cleaned1'] = list(map(clean_text, train_df.sentence1))
train_df['lemmatized_text1'] = list(map(lambda word:list(map(lemm.lemmatize, word)),train_df.Text_Cleaned1))
train_df['Text_Cleaned2'] = list(map(clean_text, train_df.sentence2))
train_df['lemmatized_text2'] = list(map(lambda word:list(map(lemm.lemmatize, word)),train_df.Text_Cleaned2))

In [124]:
cosinelist = []
similaritylist = []
for index, row in train_df.iterrows():
    #Cosine Similarity:
    counter1 = Counter(row['lemmatized_text1'])
    counter2 = Counter(row['lemmatized_text2'])
    cosine_score = counter_cosine_similarity(counter1, counter2)
    cosinelist.append(cosine_score)
    
    #Length Similarity:
    lenc1 = sum(iter(counter1.values()))
    lenc2 = sum(iter(counter2.values()))
    lengthSim = min(lenc1, lenc2) / float(max(lenc1, lenc2))
    
    #Similarity Score:
    similarityScore = lengthSim * cosine_score
    similaritylist.append(similarityScore)
train_df['cosine_similarity_score'] = cosinelist
train_df['overall_similarity_score'] = similaritylist
train_df

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,lemmatized_text1,Text_Cleaned2,lemmatized_text2,cosine_similarity_score,overall_similarity_score
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1,"[democratic, candidates, also, began, announci...","[democratic, candidate, also, began, announcin...","[democratic, candidates, also, began, announci...","[democratic, candidate, also, began, announcin...",0.909509,0.856008
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1,"[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...","[woman, exposed, sars, virus, hospital, health...",0.904534,0.804030
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1,"[said, problem, needs, corrected, space, shutt...","[said, problem, need, corrected, space, shuttl...","[said, prob, lem, needs, corrected, space, shu...","[said, prob, lem, need, corrected, space, shut...",0.777778,0.777778
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0,"[representative, phoenix, based, u, haul, decl...","[representative, phoenix, based, u, haul, decl...","[anthony, citrano, representative, whenu, decl...","[anthony, citrano, representative, whenu, decl...",0.455842,0.290081
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1,"[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...","[biggest, threat, order, seemed, looting, crim...",0.721688,0.541266
...,...,...,...,...,...,...,...,...,...,...
4072,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1,"[axelrod, died, sleep, heart, failure, said, d...","[axelrod, died, sleep, heart, failure, said, d...","[axelrod, died, heart, failure, asleep, los, a...","[axelrod, died, heart, failure, asleep, los, a...",0.805823,0.604367
4073,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1,"[saddam, son, odai, surrendered, friday, ameri...","[saddam, son, odai, surrendered, friday, ameri...","[hussein, son, uday, surrendered, yesterday, a...","[hussein, son, uday, surrendered, yesterday, a...",0.700000,0.700000
4074,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1,"[senator, clinton, decide, run, 2008, cannot, ...","[senator, clinton, decide, run, 2008, cannot, ...","[mrs, clinton, decide, contest, 2008, election...","[mr, clinton, decide, contest, 2008, election,...",0.819892,0.756823
4075,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1,"[iranian, refugee, sewed, eyes, lips, ears, pr...","[iranian, refugee, sewed, eye, lip, ear, prote...","[iranian, kurd, stitched, eyes, lips, ears, pr...","[iranian, kurd, stitched, eye, lip, ear, prote...",0.560449,0.517337


In [125]:
dev_df = pd.read_csv("dev_with_label.txt", delimiter = "r'\t", header = None, engine = 'python')
dev_df = dev_df[0].str.split("\t", expand=True)
dev_df = dev_df.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
dev_df["classification"] = pd.to_numeric(dev_df["classification"])
dev_df.drop_duplicates(inplace = True)

lemm = nltk.stem.WordNetLemmatizer()

dev_df['Text_Cleaned1'] = list(map(clean_text, dev_df.sentence1))
dev_df['lemmatized_text1'] = list(map(lambda word:list(map(lemm.lemmatize, word)),dev_df.Text_Cleaned1))
dev_df['Text_Cleaned2'] = list(map(clean_text, dev_df.sentence2))
dev_df['lemmatized_text2'] = list(map(lambda word:list(map(lemm.lemmatize, word)),dev_df.Text_Cleaned2))

In [126]:
cosinelist = []
similaritylist = []
for index, row in dev_df.iterrows():
    #Cosine Similarity:
    counter1 = Counter(row['lemmatized_text1'])
    counter2 = Counter(row['lemmatized_text2'])
    cosine_score = counter_cosine_similarity(counter1, counter2)
    cosinelist.append(cosine_score)
    
    #Length Similarity:
    lenc1 = sum(iter(counter1.values()))
    lenc2 = sum(iter(counter2.values()))
    lengthSim = min(lenc1, lenc2) / float(max(lenc1, lenc2))
    
    #Similarity Score:
    similarityScore = lengthSim * cosine_score
    similaritylist.append(similarityScore)
dev_df['cosine_similarity_score'] = cosinelist
dev_df['overall_similarity_score'] = similaritylist
dev_df

Unnamed: 0,id,sentence1,sentence2,classification,Text_Cleaned1,lemmatized_text1,Text_Cleaned2,lemmatized_text2,cosine_similarity_score,overall_similarity_score
0,dev_id_0,Local police authorities are treating the expl...,Acting New Haven Police Chief Francisco Ortiz ...,0,"[local, police, authorities, treating, explosi...","[local, police, authority, treating, explosion...","[acting, new, police, chief, francisco, ortiz,...","[acting, new, police, chief, francisco, ortiz,...",0.534522,0.400892
1,dev_id_1,The report shows that drugs sold in Canadian p...,The report shows that drugs sold in Canadian p...,1,"[report, shows, drugs, sold, canadian, pharmac...","[report, show, drug, sold, canadian, pharmacy,...","[report, shows, drugs, sold, canadian, pharmac...","[report, show, drug, sold, canadian, pharmacy,...",0.802955,0.661257
2,dev_id_2,The transition is slated to begin no later tha...,A two-week transition period will begin no lat...,1,"[transition, slated, begin, later, june, 7, da...","[transition, slated, begin, later, june, 7, da...","[two, week, transition, period, begin, later, ...","[two, week, transition, period, begin, later, ...",0.625000,0.625000
3,dev_id_3,"Like Viacom , GE -- parent of NBC -- is also s...","Like Viacom , General Electric is seen as a le...",1,"[like, viacom, ge, parent, nbc, also, seen, le...","[like, viacom, ge, parent, nbc, also, seen, le...","[like, viacom, general, electric, seen, less, ...","[like, viacom, general, electric, seen, le, en...",0.753778,0.592254
4,dev_id_4,"Last month , 62 Spanish peacekeepers died when...","In another disaster , 62 Spanish peacekeepers ...",1,"[last, month, 62, spanish, peacekeepers, died,...","[last, month, 62, spanish, peacekeeper, died, ...","[another, disaster, 62, spanish, peacekeepers,...","[another, disaster, 62, spanish, peacekeeper, ...",0.585369,0.495313
...,...,...,...,...,...,...,...,...,...,...
719,dev_id_719,"He is a brother to three-year-old Mia , from K...","Winslet , 28 , has a three-year-old daughter M...",0,"[brother, three, year, old, mia, kate, first, ...","[brother, three, year, old, mia, kate, first, ...","[winslet, 28, three, year, old, daughter, mia,...","[winslet, 28, three, year, old, daughter, mia,...",0.694365,0.595170
720,dev_id_720,Some 175 million shares traded on the Big Boar...,Some 1.6 billion shares traded on the Big Boar...,0,"[175, million, shares, traded, big, board, 7, ...","[175, million, share, traded, big, board, 7, p...","[1, 6, billion, shares, traded, big, board, 17...","[1, 6, billion, share, traded, big, board, 17,...",0.462910,0.396780
721,dev_id_721,Mr Berlusconi is accused of bribing judges to ...,Mr Berlusconi is accused of bribing judges to ...,1,"[mr, berlusconi, accused, bribing, judges, inf...","[mr, berlusconi, accused, bribing, judge, infl...","[mr, berlusconi, accused, bribing, judges, inf...","[mr, berlusconi, accused, bribing, judge, infl...",0.716115,0.620633
722,dev_id_722,"He added that those "" are not solely American ...",""" These are not solely American principles nor...",1,"[added, solely, american, principles, exclusiv...","[added, solely, american, principle, exclusive...","[solely, american, principles, exclusively, we...","[solely, american, principle, exclusively, wes...",0.771517,0.661300


### Logistic Regression Model 1: Using just Similarity Score:

In [127]:
x_train = train_df[['cosine_similarity_score']]
y_train = train_df['classification'].values
x_train_val = dev_df[['cosine_similarity_score']]
y_train_val = dev_df['classification'].values

print("train: {}, val: {}".format(x_train.shape[0], x_train_val.shape[0]))
print(x_train.shape)
print(x_train_val.shape)

train: 4077, val: 724
(4077, 1)
(724, 1)


In [128]:
model_simple = simple_logistic_classify(x_train, y_train, x_train_val, y_train_val, 'Cosine Similarity Score')

Test Score with Cosine Similarity Score features 0.5814917127071824


### Logistic Regression Models 2 and 3: Using bow and tf-idf features:

In [129]:
x_combinedCols_train = train_df['Text_Cleaned1'] + train_df['Text_Cleaned2']
x_combinedCols_val = dev_df['Text_Cleaned1'] + dev_df['Text_Cleaned2']

bow_converter = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
x = bow_converter.fit_transform(x_combinedCols_train)

words = bow_converter.get_feature_names()
len(words)

13042

In [130]:
bigram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=[2,2], lowercase=False) 
x2 = bigram_converter.fit_transform(x_combinedCols_train)
bigrams = bigram_converter.get_feature_names()
len(bigrams)

60828

In [131]:
trigram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=[3,3], lowercase=False) 
x3 = trigram_converter.fit_transform(x_combinedCols_train)
trigrams = trigram_converter.get_feature_names()
len(trigrams)

71562

In [132]:
quadgram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=[4,4], lowercase=False) 
x3 = quadgram_converter.fit_transform(x_combinedCols_train)
quadgrams = quadgram_converter.get_feature_names()
len(quadgrams)

73786

In [135]:
fivegram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=[5,5], lowercase=False) 
x3 = fivegram_converter.fit_transform(x_combinedCols_train)
fivegrams = fivegram_converter.get_feature_names()
len(fivegrams)

73687

In [136]:
# x_train = train_df[['Text_Cleaned1']].values
# y_train = train_df['classification'].values
# x_train_val = dev_df[['Text_Cleaned1', 'Text_Cleaned2']].values
# y_train_val = dev_df['classification'].values

x_train = x_combinedCols_train.values
y_train = train_df['classification'].values
x_train_val = x_combinedCols_val.values
y_train_val = dev_df['classification'].values

print("train: {}, val: {}".format(x_train.shape[0], x_train_val.shape[0]))
print(x_train.shape)
print(x_train_val.shape)

train: 4077, val: 724
(4077,)
(724,)


In [146]:
bow_transform = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False, ngram_range=[1,1])
X_tr_bow = bow_transform.fit_transform(x_train)
X_te_bow = bow_transform.transform(x_train_val)
len(bow_transform.vocabulary_)

13042

In [147]:
#Tf-Idf Transformation:
tfidf_transform = text.TfidfTransformer(norm=None)
X_tr_tfidf = tfidf_transform.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_transform.transform(X_te_bow)

In [148]:
model_bow = simple_logistic_classify(X_tr_bow, y_train, X_te_bow, y_train_val, 'bow')
model_tfidf = simple_logistic_classify(X_tr_tfidf, y_train, X_te_tfidf, y_train_val, 'tf-idf')

Test Score with bow features 0.606353591160221
Test Score with tf-idf features 0.5953038674033149


In [16]:
#We need to learn the model parameter  𝐰 . 
#However, with different hyperparameters  𝜆 , we can get different model parameter  𝐰 , resulting in different prediction performance. 
#Thus, we will use the 10-fold cross-validation to select the hyperparameter  𝜆 .

#Here we set the folds equal to 10 for 10-fold cross-validation
folds = 9

#We get the number of samples in the training and validation set
num_train = x_train.shape[0] 

#Now, we shuffle the index of samples in the train_val set
index_of_samples = np.arange(num_train) 
shuffle(index_of_samples)

#We split the index of the train_valid set into 10 folds
index_of_folds = index_of_samples.reshape(folds, -1)
print(index_of_folds)

#As suggested above, the hyperparameters chosen are listed below
regularization_coefficient = [10**(-5), 10**(-3), 10**(-2), 10**(-1), 1, 10, 20, 50]

#Variables we create to store the values of the best accuracy and best regression:
best_acc = 0.0
best_reg = 0.0

for reg in regularization_coefficient:
    #10-fold cross-validation
    sum_acc = 0.0
    for fold in range(folds):
        
        index_of_folds_temp = index_of_folds.copy()
        
        #We are getting the index of the validation set and storing it in a variable valid_index
        valid_index = index_of_folds_temp[fold,:].reshape(-1) 
        #We are getting the index of the training set and storing it in a variable train_index
        train_index = np.delete(index_of_folds_temp, fold, 0).reshape(-1)
        
        #Our training set:
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        
        #Our validation set:
        X_valid = x_train[valid_index]
        Y_valid = y_train[valid_index]
                
        #We write this to build the model with different hyperparameters:
        clf = LogisticRegression(penalty='l2', C=reg, solver='lbfgs')
        
        #Train the model with the training set:
        clf.fit(X_train, Y_train)
        
        y_valid_pred = clf.predict(X_valid)
        acc = accuracy_score(Y_valid, y_valid_pred)
        
        sum_acc += acc
    
    cur_acc = sum_acc / folds
    
    print("reg_coeff: {}, acc: {:.3f}".format(1.0/reg, cur_acc))
    
    #We now want to store the best hyperparameter:
    if cur_acc > best_acc:
        best_acc = cur_acc
        best_reg = reg
        
print("Best Accuracy: {:.4f} ".format(best_acc))
print("Best Reg: {:}".format(best_reg))

[[1082 3902 4010 ...   82 1919 3978]
 [2484  566  717 ... 1031 3427 1207]
 [1351 3459 1572 ... 2117 4029 2607]
 ...
 [1085 3641   99 ...  375 2251 2706]
 [2167 2185 3339 ... 2818  932 1583]
 [2989 1313   57 ...  357 1096  214]]


ValueError: could not convert string to float: "If Walker appeals Parrish 's ruling , it would stop the extradition process and could take several months , Rork said ."

In [None]:
#Normalizing the features:
normalizer = StandardScaler()
X_train_val = normalizer.fit_transform(X_train_val)
X_test = normalizer.transform(X_test)

print(X_test.shape)
print(X_train_val.shape)