In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [2]:
training = pd.read_csv('stack_stats_2020_train.csv')
testing = pd.read_csv('stack_stats_2020_test.csv')


In [3]:
#stackoverflowdata
#joining test and train data to make it easier to pre-process and clean all the data at one and then split them later on
stack_data = pd.concat([training, testing])
stack_data.head()

Unnamed: 0,Id,Score,Body,Title,Tags
0,495560,1,<p>I have a set of data that I am transforming...,R: emmeans back tranform clr data using clrInv,<r><mixed-model><linear><lsmeans>
1,489896,0,<p>We are sending a one bit message to someone...,Trying to determine the failure rate of redund...,<probability><python>
2,497951,2,<p>I am aware that there is a similar post: <a...,How to derive categorical cross entropy update...,<logistic><cross-entropy>
3,478542,2,<p>I have a Poisson distributed glm where I ha...,"Learning more about glm parameters, how to dig...",<generalized-linear-model><interpretation>
4,458388,0,<p>1) how do i decide which transformation or ...,Is there I guide to decide which transformatio...,<python><data-transformation><dataset><feature...


In [4]:
stack_data['UsefulQuestion'] = (stack_data['Score'] >= 1).astype('int32').astype('object')

In [5]:
stack_data['UsefulQuestion']

0       1
1       0
2       1
3       1
4       0
       ..
8244    0
8245    1
8246    1
8247    1
8248    1
Name: UsefulQuestion, Length: 27496, dtype: object

### To extract text and remove tag

In [6]:
#create a function that turns html text into plain text
def html_text(table, column):
    plain_html = []
    for index, row in column.iteritems():
        each = BeautifulSoup(row)
        plain_html.append(each.get_text())
        #print(column.row)
    table['Plain_Text'] = plain_html
    return table.head()

In [7]:
html_text(stack_data, stack_data['Body'])

Unnamed: 0,Id,Score,Body,Title,Tags,UsefulQuestion,Plain_Text
0,495560,1,<p>I have a set of data that I am transforming...,R: emmeans back tranform clr data using clrInv,<r><mixed-model><linear><lsmeans>,1,I have a set of data that I am transforming us...
1,489896,0,<p>We are sending a one bit message to someone...,Trying to determine the failure rate of redund...,<probability><python>,0,We are sending a one bit message to someone. ...
2,497951,2,<p>I am aware that there is a similar post: <a...,How to derive categorical cross entropy update...,<logistic><cross-entropy>,1,I am aware that there is a similar post: Vecto...
3,478542,2,<p>I have a Poisson distributed glm where I ha...,"Learning more about glm parameters, how to dig...",<generalized-linear-model><interpretation>,1,I have a Poisson distributed glm where I have ...
4,458388,0,<p>1) how do i decide which transformation or ...,Is there I guide to decide which transformatio...,<python><data-transformation><dataset><feature...,0,1) how do i decide which transformation or sca...


### To remove the \n from the body text

In [8]:
import re
# re has expressions to search and manipulate strings


column1 = []
for values in stack_data['Plain_Text']:
    column1.append(re.sub('\\n', ' ', values))
stack_data['Plainer_Text'] = column1

column2 = []
for values in stack_data['Plainer_Text']:
    column2.append(re.sub('\#', ' ', values))
stack_data['Plainer_Text'] = column2

column3 = []
for values in stack_data['Plainer_Text']:
    column3.append(re.sub('(\$.*?\$)', ' ', values))
stack_data['Plainer_Text'] = column3

### Text Cleaning: Body column

In [9]:
bodytext = stack_data['Plainer_Text']

In [10]:
#changing to lowercase
body_lowercase_test = bodytext.str.lower()

In [11]:
#removing punctuation marks
from string import punctuation

def remove_punctuation(document):
    no_punct = ''.join([character for character in document if character not in punctuation])
    return no_punct

text_body_no_punct = body_lowercase_test.apply(remove_punctuation)

In [12]:
import nltk

In [13]:
#remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anmol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
from nltk.tokenize import word_tokenize

tokenized_text_body = text_body_no_punct.apply(word_tokenize)
tokenized_text_body

0       [i, have, a, set, of, data, that, i, am, trans...
1       [we, are, sending, a, one, bit, message, to, s...
2       [i, am, aware, that, there, is, a, similar, po...
3       [i, have, a, poisson, distributed, glm, where,...
4       [1, how, do, i, decide, which, transformation,...
                              ...                        
8244    [my, data, is, of, the, form, where, i, denote...
8245    [i, noticed, the, term, anova, used, in, many,...
8246    [im, trying, to, do, logistic, regression, but...
8247    [consider, the, following, experimental, desig...
8248    [i, am, constructing, different, configuration...
Name: Plainer_Text, Length: 27496, dtype: object

In [15]:
def remove_stopwords(document):
    words = [word for word in document if not word in stop_words]
    return words
body_no_stopword_text = tokenized_text_body.apply(remove_stopwords)

In [16]:
#stemming

In [17]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()

def stemmer(document):
    stemmed_document = [porter.stem(word) for word in document]
    return stemmed_document

In [18]:
text_body_stemmed = body_no_stopword_text.apply(stemmer)
text_body_stemmed

0       [set, data, transform, use, clr, function, lib...
1       [send, one, bit, messag, someon, 60, chanc, me...
2       [awar, similar, post, vector, cross, entropi, ...
3       [poisson, distribut, glm, identifi, origin, pa...
4       [1, decid, transform, scale, use, pass, data, ...
                              ...                        
8244    [data, form, denot, compon, independ, variabl,...
8245    [notic, term, anova, use, mani, context, one, ...
8246    [im, tri, logist, regress, cant, seem, get, re...
8247    [consid, follow, experiment, design, withinsub...
8248    [construct, differ, configur, random, forest, ...
Name: Plainer_Text, Length: 27496, dtype: object

### Document term matrix- Body

In [19]:
#detokenization
from nltk.tokenize.treebank import TreebankWordDetokenizer
text_body_detokenized = text_body_stemmed.apply(TreebankWordDetokenizer().detokenize)

In [20]:
#creating dtm
#seting min_df to 0.05 to preserve as many words as possible and elimination of irrelevant number

from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer(min_df=0.01)

sparse_dtm_body = countvec.fit_transform(text_body_detokenized)
sparse_dtm_body

<27496x908 sparse matrix of type '<class 'numpy.int64'>'
	with 1042114 stored elements in Compressed Sparse Row format>

In [21]:
dtm_body = pd.DataFrame(sparse_dtm_body.toarray(), columns=countvec.get_feature_names_out(), index=stack_data.index)
dtm_body.head()

Unnamed: 0,0001,001,005,01,02,05,10,100,1000,10000,...,written,wrong,x1,x2,xi,ye,year,yet,yield,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Text Cleaning Title of the question

In [22]:
title_text = stack_data['Title']

In [23]:
#change to lowercase
title_text_lowercase = title_text.str.lower()

In [24]:
#remove punctuation
title_text_no_punct = title_text_lowercase.apply(remove_punctuation)

In [25]:
#tokenizing the text
title_text_tokenized = title_text_no_punct.apply(word_tokenize)
title_text_tokenized.head()

0    [r, emmeans, back, tranform, clr, data, using,...
1    [trying, to, determine, the, failure, rate, of...
2    [how, to, derive, categorical, cross, entropy,...
3    [learning, more, about, glm, parameters, how, ...
4    [is, there, i, guide, to, decide, which, trans...
Name: Title, dtype: object

In [26]:
#removing all stop words from the text
title_no_stopwords = title_text_tokenized.apply(remove_stopwords)

In [27]:
#stemming the text present in title
title_text_stemmed = title_no_stopwords.apply(stemmer)
title_text_stemmed

0       [r, emmean, back, tranform, clr, data, use, cl...
1       [tri, determin, failur, rate, redundantli, sen...
2       [deriv, categor, cross, entropi, updat, rule, ...
3                      [learn, glm, paramet, dig, deeper]
4       [guid, decid, transform, choos, differ, scenar...
                              ...                        
8244                    [visualis, high, dimension, data]
8245    [analysi, residu, varianc, still, anova, regre...
8246                 [handl, miss, data, logist, regress]
8247    [mix, model, treat, random, factor, nest, with...
8248                        [data, partit, spatial, data]
Name: Title, Length: 27496, dtype: object

### Document Term Matrix - Title

In [28]:
title_text_detokenized = title_text_stemmed.apply(TreebankWordDetokenizer().detokenize)

In [29]:
countvec = CountVectorizer(min_df=0.01)

sparse_dtm_title = countvec.fit_transform(title_text_detokenized)
sparse_dtm_title

<27496x109 sparse matrix of type '<class 'numpy.int64'>'
	with 74395 stored elements in Compressed Sparse Row format>

In [30]:
dtm_title = pd.DataFrame(sparse_dtm_title.toarray(), columns=countvec.get_feature_names_out(), index=stack_data.index)
dtm_title.head()

Unnamed: 0,algorithm,analysi,anova,base,bayesian,best,binari,binomi,calcul,categor,...,two,understand,use,valid,valu,variabl,varianc,vs,way,weight
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Text Cleaning: Tags

In [31]:
tags_text = stack_data['Tags']

In [32]:
#change to lowercase
text_tags_lowercase = tags_text.str.lower()
text_tags_lowercase.head()

0                    <r><mixed-model><linear><lsmeans>
1                                <probability><python>
2                            <logistic><cross-entropy>
3           <generalized-linear-model><interpretation>
4    <python><data-transformation><dataset><feature...
Name: Tags, dtype: object

In [33]:
#remove "< >"
import re

new_tags = []
for values in text_tags_lowercase:
    new_tags.append(re.findall('<(.*?)>', values))
stack_data['Tags'] = new_tags

stack_data['Tags']

0                       [r, mixed-model, linear, lsmeans]
1                                   [probability, python]
2                               [logistic, cross-entropy]
3              [generalized-linear-model, interpretation]
4       [python, data-transformation, dataset, feature...
                              ...                        
8244                     [r, data-visualization, ggplot2]
8245    [regression, anova, generalized-linear-model, ...
8246    [r, regression, logistic, missing-data, regres...
8247                          [r, mixed-model, lme4-nlme]
8248    [machine-learning, random-forest, spatial, par...
Name: Tags, Length: 27496, dtype: object

In [34]:
#remove punctuation
tag_text_no_punct = text_tags_lowercase.apply(remove_punctuation)

In [35]:
#tokenizing the text
tag_text_tokenized = tag_text_no_punct.apply(word_tokenize)
tag_text_tokenized.head()

0                           [rmixedmodellinearlsmeans]
1                                  [probabilitypython]
2                               [logisticcrossentropy]
3               [generalizedlinearmodelinterpretation]
4    [pythondatatransformationdatasetfeatureenginee...
Name: Tags, dtype: object

In [36]:
#removing all stop words from the text
tag_no_stopwords = tag_text_tokenized.apply(remove_stopwords)

In [37]:
#stemming the text present in title
tag_text_stemmed = tag_no_stopwords.apply(stemmer)
tag_text_stemmed

0                               [rmixedmodellinearlsmean]
1                                     [probabilitypython]
2                                  [logisticcrossentropi]
3                       [generalizedlinearmodelinterpret]
4       [pythondatatransformationdatasetfeatureenginee...
                              ...                        
8244                          [rdatavisualizationggplot2]
8245    [regressionanovageneralizedlinearmodelmodeling...
8246    [rregressionlogisticmissingdataregressionstrat...
8247                                 [rmixedmodellme4nlm]
8248    [machinelearningrandomforestspatialpartitionin...
Name: Tags, Length: 27496, dtype: object

### Document Term Matrix - Tags


In [38]:
tag_text_detokenized = tag_text_stemmed.apply(TreebankWordDetokenizer().detokenize)

In [39]:
countvec = CountVectorizer(min_df=0.0001)

sparse_dtm_tag = countvec.fit_transform(tag_text_detokenized)
sparse_dtm_tag

<27496x784 sparse matrix of type '<class 'numpy.int64'>'
	with 4866 stored elements in Compressed Sparse Row format>

In [40]:
dtm_tag = pd.DataFrame(sparse_dtm_tag.toarray(), columns=countvec.get_feature_names_out(), index=stack_data.index)
dtm_tag.head()

Unnamed: 0,agreementstatist,agreementstatisticscohenskappa,aic,aicbic,anova,anovacontrast,anovainteract,anovalme4nlm,anovamixedmodel,anovaposthoc,...,tsne,ttest,ttestpaireddata,uncertaintyerrorpropag,varianc,variancecovari,varianceleastsquar,variancestandarddevi,wilcoxonmannwhitneytest,wilcoxonsignedrank
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
dtm_body = dtm_body.add_suffix('_body')
dtm_title = dtm_title.add_suffix('_title')
dtm_tag = dtm_tag.add_suffix('_tags')

In [42]:
body_title_joined = dtm_body.join(dtm_title)
body_title_tag_joined = body_title_joined.join(dtm_tag)


In [43]:
#impp
body_title_tag_joined

Unnamed: 0,0001_body,001_body,005_body,01_body,02_body,05_body,10_body,100_body,1000_body,10000_body,...,tsne_tags,ttest_tags,ttestpaireddata_tags,uncertaintyerrorpropag_tags,varianc_tags,variancecovari_tags,varianceleastsquar_tags,variancestandarddevi_tags,wilcoxonmannwhitneytest_tags,wilcoxonsignedrank_tags
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19242,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
stack_data_processed = stack_data[['Id','Score','UsefulQuestion']].join(body_title_tag_joined)
stack_data_processed.head()

Unnamed: 0,Id,Score,UsefulQuestion,0001_body,001_body,005_body,01_body,02_body,05_body,10_body,...,tsne_tags,ttest_tags,ttestpaireddata_tags,uncertaintyerrorpropag_tags,varianc_tags,variancecovari_tags,varianceleastsquar_tags,variancestandarddevi_tags,wilcoxonmannwhitneytest_tags,wilcoxonsignedrank_tags
0,495560,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,495560,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,495560,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,495560,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,495560,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
stack_data_processed.to_csv('stack_data_processed.csv')

In [46]:
training_set=stack_data_processed[0:19249]

In [47]:
testing_set=stack_data_processed[19249:]

In [48]:
feature_cols = training_set.columns[3:621]
x_train = training_set[feature_cols]
x_test = testing_set[feature_cols]
y_train = training_set['UsefulQuestion']
y_test = testing_set['UsefulQuestion']

x_train=x_train.to_numpy().astype('int32')
y_train=y_train.to_numpy().astype('int32')

MemoryError: Unable to allocate 1.66 GiB for an array with shape (1803, 123733) and data type int64

## Part b

In [None]:
!pip install statsmodels==0.13.0
import os
import statsmodels.api as sm
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split

y = stack_data_processed['UsefulQuestion'][0:76990].astype('int32')
X = body_title_tag_joined

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=88)
X_train.shape, X_test.shape



In [None]:
X.shape, y.shape

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=88)
logreg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
y_prob = logreg.predict_proba(X_test)
y_pred = pd.Series([1 if x > 0.5 else 0 for x in y_prob[:,1]], index=y_test.index)

cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

In [None]:
y_pred_lda = lda.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
lda_confusion = confusion_matrix(y_test, y_pred_lda).ravel()

In [None]:
accuracy_score(y_test, y_pred_lda)

In [None]:
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
#cross-validation to get the optimal ccp alpha
#fits reduced from 2010 (demonstrated in lab) to 110 & max_depth from 30 to 20 for sake of time


grid_values = {'ccp_alpha':np.linspace(0, 0.10, 11),
               'min_samples_leaf': [5],
               'min_samples_split': [20],
               'max_depth': [20],
               'random_state': [88]}

dtc = DecisionTreeClassifier()
dtc_cv_acc = GridSearchCV(dtc, param_grid = grid_values, cv = 10, verbose = 1, scoring = 'accuracy')
dtc_cv_acc.fit(X_train, y_train)

In [None]:
#derive the top 10 ccp_alpha values
acc = dtc_cv_acc.cv_results_['mean_test_score']
ccp = dtc_cv_acc.cv_results_['param_ccp_alpha'].data

pd.DataFrame({'ccp lpha': ccp, 'Validation Accuracy': acc}).head(10)

Boosting

In [None]:
import time

def bootstrap_validation(test_data, test_label, train_label, model, metrics_list, sample=500, random_state=66):
    tic = time.time()
    n_sample = sample
    n_metrics = len(metrics_list)
    output_array=np.zeros([n_sample, n_metrics])
    output_array[:]=np.nan
    print(output_array.shape)
    for bs_iter in range(n_sample):
        bs_index = np.random.choice(test_data.index, len(test_data.index), replace=True)
        bs_data = test_data.loc[bs_index]
        bs_label = test_label.loc[bs_index]
        bs_predicted = model.predict(bs_data)
        for metrics_iter in range(n_metrics):
            metrics = metrics_list[metrics_iter]
            output_array[bs_iter, metrics_iter]=metrics(bs_predicted,bs_label,train_label)
#         if bs_iter % 100 == 0:
#             print(bs_iter, time.time()-tic)
    output_df = pd.DataFrame(output_array)
    return output_df

In [None]:
def tpr(predictions):
    tn, fp, fn, tp = confusion_matrix(y_test, predictions)
    return tp/(tp + fn)

def fpr(predictions):
    tn, fp, fn, tp = confusion_matrix(y_test, predictions)
    return fp/(fp + tn)

In [None]:
bs_output_logreg = bootstrap_validation(X_test, y_test,y_train, logreg, metrics_list =[tpr], sample = 500)