# Polynomial Features

## Imports and functions

In [1]:
DSET_FOLDER_PATH = './dataset/quora/'
import pandas as pd
from tqdm import tqdm 
tqdm.pandas()
import numpy as np
train_dset_df = pd.read_csv(DSET_FOLDER_PATH + "train.csv")
test_dset_df = pd.read_csv(DSET_FOLDER_PATH + "test.csv")

In [2]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

def summarize(model, X, y):
    yhat = np.round(model.predict(X))
    print("F1 score:", f1_score(y, yhat))
    print("Precision:", precision_score(y, yhat))
    print("Recall:", recall_score(y, yhat))
    print("Confusion matrix:")
    print(confusion_matrix(y, yhat))
    return f1_score(y, yhat)

def cross_validate(model, n_folds, X, y):
    kfcv = StratifiedKFold(n_splits=n_folds)
    i = 0
    validation_f1_scores = []
    for train_indices, test_indices in kfcv.split(X,y):
        print("Round number", i)
        i += 1
        trainset_X = X[train_indices,:]
        testset_X  = X[test_indices,:]
        trainset_y = y[train_indices]
        testset_y  = y[test_indices]
        model.fit(trainset_X, trainset_y)
        print("\n\nTraining:")
        summarize(model,trainset_X, trainset_y)
        print("Testing:")
        validation_f1_scores.append(summarize(model, testset_X, testset_y))
        print(("-"*15))
    validation_f1_scores = np.array(validation_f1_scores)
    print("Mean validation f1 score:", np.mean(validation_f1_scores))
    print("Median validation f1 score:", np.median(validation_f1_scores))

## Vectorization and Polynomial Features

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(lowercase=False, token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'", ngram_range=(1,1))

In [4]:
train_X = vectorizer.fit_transform(train_dset_df["question_text"])
train_y = train_dset_df["target"].to_numpy()

In [5]:
from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(degree=2)
pf.fit_transform(train_X)

ValueError: negative column index found

## Model Cross-Validation

In [5]:
class Logistic_Separator:
    '''
    Using the decision_function of the first model, fit a logistic curve to the output of the first model. 
    '''
    def __init__(self, decision_model, logistic_model):
        self.decision_model = decision_model 
        self.logistic_model = logistic_model 
    def fit(self,X, y):
        print("Fitting the base model.")
        self.decision_model.fit(X, y)
        print("Fitting the intermediate")
        intermediate = self.decision_model.decision_function(X).reshape((-1,1))
        self.logistic_model.fit(intermediate, y)
    def predict(self, X):
        intermediate = self.decision_model.decision_function(X).reshape((-1,1))
        return self.logistic_model.predict(intermediate)

In [6]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


In [28]:
lsvm = LinearSVC(C=0.24, class_weight={0:1,1:1}, max_iter=10000)
logistic = LogisticRegression(C=1, class_weight = {0:1,1:3.8}, max_iter=1000, n_jobs=6)
model=Logistic_Separator(lsvm, logistic)


In [29]:
cross_validate(model, 10, train_X, train_y)

Round number 0
Fitting the base model.
Fitting the intermediate


Training:
F1 score: 0.893729287989447
Precision: 0.8394342929669399
Recall: 0.9555336421593359
Confusion matrix:
[[653729   7970]
 [  1939  41667]]
Testing:
F1 score: 0.6387802186779015
Precision: 0.6213895394223263
Recall: 0.657172342621259
Confusion matrix:
[[71583  1940]
 [ 1661  3184]]
---------------
Round number 1
Fitting the base model.
Fitting the intermediate


Training:
F1 score: 0.8946820945619886
Precision: 0.8402658878034042
Recall: 0.9566344081089758
Confusion matrix:
[[653769   7930]
 [  1891  41715]]
Testing:
F1 score: 0.6353768844221106
Precision: 0.6191968658178256
Recall: 0.6524251805985553
Confusion matrix:
[[71579  1944]
 [ 1684  3161]]
---------------
Round number 2
Fitting the base model.
Fitting the intermediate


Training:
F1 score: 0.8938383340124821
Precision: 0.8394328183850631
Recall: 0.9557848870542369
Confusion matrix:
[[653728   7972]
 [  1928  41677]]
Testing:
F1 score: 0.6314532975231274

lsvm = LinearSVC(C=0.24, class_weight={0:1,1:1}, max_iter=10000)

logistic = LogisticRegression(C=1, class_weight = {0:1,1:3.8}, max_iter=1000, n_jobs=6)

model=Logistic_Separator(lsvm, logistic)


trigrams

Mean validation f1 score: 0.633484221217779

Median validation f1 score: 0.633362960290482

## Testset Write

In [30]:
test_X  = vectorizer.transform(test_dset_df["question_text"])
model.fit(train_X, train_y)

Fitting the base model.
Fitting the intermediate


In [31]:
test_yhat = model.predict(test_X)
output_df = test_dset_df.copy()
output_df.info()
output_df.drop(inplace=True, axis=1, labels="question_text")
output_df["preprocessed_joined"] = test_yhat
output_df = output_df.rename(columns={"qid":"qid", "preprocessed_joined":"target"})
output_df.target = output_df.target.apply(round)
output_df.to_csv("./outputs/2020_11_28_c_testset_output.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   qid            522449 non-null  object
 1   question_text  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB


In [32]:
output_df

Unnamed: 0,qid,target
0,f56a9a31974dc66186e8,0
1,d957c3758060f45da303,0
2,ad822d5abaedb9e247b9,0
3,4e979c23eeb6a4bd1f2e,0
4,333cc031262566b8da49,0
...,...,...
522444,e8e6aa5226f36c27fe41,0
522445,015fd068afcb9d0b4007,0
522446,9f0ef49eff6a3ff9e735,0
522447,d6b02f52f76dc4c22afd,0
