## Q4.1.1

#### Packages and functions

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import re, string
from collections import Counter
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
import gensim
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from scipy.stats.stats import pearsonr
import readability

In [2]:
# remove punctuation, multiple spaces and stopwords
def data_preprocess(input_df, stop_word = True):
    new_wopunc = input_df.str.replace('[^\w\s]',' ')
    if stop_word:
        stop_words = r'\b(?:{})\b'.format('|'.join(set(stopwords.words('english'))))
        new_wostw = new_wopunc.str.lower().str.replace(stop_words, ' ')
        New = new_wostw.str.replace('\s+', ' ', regex=True)
    else:
        New = new_wopunc.str.lower().str.replace('\s+', ' ', regex=True)
    return New

In [11]:
# generate the features
# Maximum repetition of unigrams
def max_rep_uni(input_s):
    max_rep_uni= pd.Series(map(lambda x: max(Counter(x).values()), filter(None, input_s.str.split(' '))))
    return max_rep_uni

In [12]:
# generate the features
# Maximum repetition of bigrams
def max_rep_bi(input_s):
    max_rep_bi = pd.Series(map(lambda x: max(Counter(list(nltk.bigrams(x))).values()), filter(None, input_s.str.split(' '))))
    return max_rep_bi

In [6]:
# function to generate word2vec representation of the reviews 
def word2vec_embedding(input_s, vec_pre):
    vec = []
    for i in range(len(input_s)):
        l = [w for w in input_s[i].split(' ') if w in vec_pre]
        if len(l)>0:
            vec_l = []
            for j in range(len(l)):
                vec_l.append(vec_pre[l[j]])
            vec_l = sum(vec_l)/len(vec_l)
            vec.append(vec_l)
    return pd.Series(vec)

In [13]:
# generate the features
# Maximum Similarity of sentences
def max_sim(input_s):
    l = pd.Series(map(lambda x:pd.Series( nltk.sent_tokenize(x)), input_s))
    sos = pd.Series(map(data_preprocess,l))
    l_0 = list(map(lambda x: pd.Series(x.replace(' ', np.nan, inplace=True)), sos))
    sos_cl = pd.Series(map(lambda x: pd.Series(x.dropna().reset_index(drop=True)), sos))
    word_rep = pd.Series(map(lambda x: word2vec_embedding(x, word2vec), sos_cl))
    max_s = []
    for i in range(len(word_rep)):
        
        if len(word_rep[i]) == 1:
            max_s.append(1)
        else:
            cos = []
            for j in range(len(word_rep[i])):
                for k in range(len(word_rep[i])):
                    if j != k:
                        cos.append(cosine_similarity([word_rep[i][j],word_rep[i][k]])[0][1])
            max_s.append(max(cos))
    return max_s

#### Data Preprocess and Feature Generation

In [9]:
#Read files
train_df=pd.read_csv("./Train_Data.csv")
test_df=pd.read_csv("./Test_Data.csv",encoding='iso-8859-1')

In [15]:
#Unzip the file to word directory
word2vec = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary = True)

In [16]:
train_df['New Summary'] = data_preprocess(train_df['Summary'])
train_df['max_rep_uni'] = max_rep_uni(train_df['New Summary'])
train_df['max_rep_bi'] = max_rep_bi(train_df['New Summary'])
train_df['max_sim'] = max_sim(train_df['Summary'])

In [17]:
test_df['New Summary'] = data_preprocess(test_df['Summary'])
test_df['max_rep_uni'] = max_rep_uni(test_df['New Summary'])
test_df['max_rep_bi'] = max_rep_bi(test_df['New Summary'])
test_df['max_sim'] = max_sim(test_df['Summary'])

#### Modeling

In [19]:
reg = LinearRegression().fit(train_df[['max_rep_uni','max_rep_bi', 'max_sim']], train_df['Non-Redundancy'])

In [20]:
pred = reg.predict(test_df[['max_rep_uni','max_rep_bi', 'max_sim']])

In [21]:
mse = mean_squared_error(test_df['Non-Redundancy'], pred)
print("MSE is %s." % mse)

MSE is 0.21356612418283025.


In [23]:
p_cor = pearsonr(test_df['Non-Redundancy'], pred)
print("Pearson coefficient is %s." % p_cor[0])

Pearson coefficient is 0.6880791212152372.


## Q4.1.2

#### Functions

In [24]:
# generate the features
# Number of stopwords
def count_stw(l):
    c = 0
    for i in range(len(l)):
        if l[i] in set(stopwords.words('english')):
            c += 1
    return c

def num_stw(input_s):
    numstw = pd.Series(map(count_stw,filter(None, input_s.str.split(' '))))
    return numstw

In [25]:
# Number of unique words
def num_uni(input_s):
    num_uni = pd.Series(map(lambda x:len(Counter(x)),filter(None, input_s.str.split(' '))))
    return num_uni

#### Generate features 1

In [27]:
train_df['New_Summary_w_sw'] = data_preprocess(train_df['Summary'], stop_word = False)
test_df['New_Summary_w_sw'] = data_preprocess(test_df['Summary'], stop_word = False)
train_df['numstw'] = num_stw(train_df['New_Summary_w_sw'])
test_df['numstw'] = num_stw(test_df['New_Summary_w_sw'])

#### Modeling 1

In [67]:
reg = LinearRegression().fit(train_df[['max_rep_uni','max_rep_bi', 'max_sim','numstw']], train_df['Non-Redundancy'])

In [68]:
pred = reg.predict(test_df[['max_rep_uni','max_rep_bi', 'max_sim','numstw']])

In [30]:
mse = mean_squared_error(test_df['Non-Redundancy'], pred)
print("MSE is %s." % mse)

MSE is 0.22434490250489547.


In [32]:
p_cor = pearsonr(test_df['Non-Redundancy'], pred)
print("Pearson coefficient is %s." % p_cor[0])

Pearson coefficient is 0.6660487180073126.


In [70]:
reg.coef_

array([-0.0730213 , -0.1701772 , -1.86224303,  0.00975268])

The model was not improved when adding Number of stopwords. 
The Number of stopwords was expected to have negative impact on non-redundency. 
But the model shows a very small positive coefficient for this feature and model was not improved.

#### Generate features 2

In [33]:
train_df['numuni'] = num_uni(train_df['New Summary'])
test_df['numuni'] = num_uni(test_df['New Summary'])

#### Modeling 2

In [71]:
reg = LinearRegression().fit(train_df[['max_rep_uni','max_rep_bi', 'max_sim','numuni']], train_df['Non-Redundancy'])

In [72]:
pred = reg.predict(test_df[['max_rep_uni','max_rep_bi', 'max_sim','numuni']])

In [73]:
mse = mean_squared_error(test_df['Non-Redundancy'], pred)
print("MSE is %s." % mse)

MSE is 0.19838644426870203.


In [74]:
p_cor = pearsonr(test_df['Non-Redundancy'], pred)
print("Pearson coefficient is %s." % p_cor[0])

Pearson coefficient is 0.7089801217174854.


In [75]:
reg.coef_

array([-0.07959178, -0.1436767 , -1.6476043 ,  0.01932475])

The model was improved when adding Number of unique words. 

MSE from 0.214 to 0.198

P_cor from 0.688 to 0.709

This is reasonable because the more uniques words used, the more meaningful and the less redundency the summary is.

## Q4.2.1

#### Functions

In [39]:
#Generate feature
#Total number of repetitive unigrams
def count_rep_uni(input_list):
    c=0
    for i in range(len(input_list)-1):
        if input_list[i] == input_list[i+1]:
            c +=1
    return c

def rep_uni(input_s):
    l = list(filter(None, input_s.str.split(' ')))
    rep_uni = pd.Series(list(map(count_rep_uni, l)))
    return rep_uni

In [40]:
# Total number of repetitive bigrams
def count_rep_bi(input_list):
    c=0
    for i in range(len(input_list)-2):
        if input_list[i] == input_list[i+2]:
            c +=1
    return c

def rep_bi(input_s):
    l = list(map(lambda x: list(nltk.bigrams(x)), filter(None, input_s.str.split(' '))))
    rep_bi = pd.Series(list(map(count_rep_bi, l)))
    return rep_bi

In [41]:
# readability score 
def read_score(input_s):
    l = pd.Series(map(lambda x:pd.Series( nltk.sent_tokenize(x)), input_s))
    sos = pd.Series(map(data_preprocess,l))
    l_0 = list(map(lambda x: pd.Series(x.replace(' ', np.nan, inplace=True)), sos))
    sos_cl = pd.Series(map(lambda x: pd.Series(x.dropna().reset_index(drop=True)), sos))
    read_score = []
    for i in range(len(sos_cl)):
        sc = min(list(map(lambda x: readability.getmeasures(x, lang='en')['readability grades']['FleschReadingEase'], sos_cl[i])))
        read_score.append(sc)
    return read_score

#### Generate features

In [42]:
train_df['rep_uni'] = rep_uni(train_df['New_Summary_w_sw'])
train_df['rep_bi'] = rep_bi(train_df['New_Summary_w_sw'])
train_df['read_score'] = read_score(train_df['Summary'])

In [43]:
test_df['rep_uni'] = rep_uni(test_df['New_Summary_w_sw'])
test_df['rep_bi'] = rep_bi(test_df['New_Summary_w_sw'])
test_df['read_score'] = read_score(test_df['Summary'])

#### Modeling

In [45]:
reg_fl = LinearRegression().fit(train_df[['rep_uni', 'rep_bi', 'read_score']], train_df['Fluency'])

In [46]:
pred_fl = reg_fl.predict(test_df[['rep_uni', 'rep_bi', 'read_score']])

In [47]:
mse = mean_squared_error(test_df['Fluency'], pred_fl)
print("MSE is %s." % mse)

MSE is 0.24180648384915657.


In [49]:
p_cor = pearsonr(test_df['Fluency'], pred_fl)
print("Pearson coefficient is %s." % p_cor[0])

Pearson coefficient is 0.28167521957654607.


## Q4.2.2

#### Functions

In [50]:
# number of sentences
def num_sen(input_s):
    l = pd.Series(map(lambda x:pd.Series( nltk.sent_tokenize(x)), input_s))
    num_sen = pd.Series(map(len,l))
    return num_sen

In [51]:
# count preposition from a pos_tag list
def count_p(inputlist):
    c = 0
    for i in range(len(inputlist)):
        if inputlist[i][1][0:2] == 'IN':
            c += 1
    return c

In [53]:
# num of Preps
def num_prep(input_s):
    l = pd.Series(map(lambda x:pd.Series(nltk.pos_tag(nltk.word_tokenize(x))), input_s))
    num_verb = pd.Series(map(count_p,l))
    return num_verb

#### Generate features 1

In [54]:
train_df['numsen'] = num_sen(train_df['Summary'])
test_df['numsen'] = num_sen(test_df['Summary'])

#### Modeling 1

In [55]:
reg_fl = LinearRegression().fit(train_df[['rep_uni', 'rep_bi','read_score','numsen']], train_df['Fluency'])

In [56]:
pred_fl = reg_fl.predict(test_df[['rep_uni', 'rep_bi','read_score','numsen']])

In [57]:
mse = mean_squared_error(test_df['Fluency'], pred_fl)
print("MSE is %s." % mse)

MSE is 0.24771596926819606.


In [59]:
p_cor = pearsonr(test_df['Fluency'], pred_fl)
print("Pearson coefficient is %s." % p_cor[0])

Pearson coefficient is 0.30348022383819573.


In [78]:
reg_fl.coef_

array([-0.16522268, -0.02911817,  0.00020206,  0.02909331])

The model was not improved when adding number of sentences. 
The number of sentences was expected to have positive impact on fluency. 
The model shows a positive coefficient for this feature but the model was not improved.

#### Generate features 2

In [60]:
train_df['numprep'] = num_prep(train_df['New Summary'])
test_df['numprep'] = num_prep(test_df['New Summary'])

#### Modeling 2

In [61]:
reg_fl = LinearRegression().fit(train_df[['rep_uni', 'rep_bi','read_score','numprep']], train_df['Fluency'])

In [62]:
pred_fl = reg_fl.predict(test_df[['rep_uni', 'rep_bi','read_score','numprep']])

In [63]:
mse = mean_squared_error(test_df['Fluency'], pred_fl)
print("MSE is %s." % mse)

MSE is 0.2391553399924169.


In [65]:
p_cor = pearsonr(test_df['Fluency'], pred_fl)
print("Pearson coefficient is %s." % p_cor[0])

Pearson coefficient is 0.3004648103413536.


The model was improved when adding num of Prepositions. 

MSE from 0.242 to 0.239

P_cor from 0.282 to 0.300

This is reasonable because the more prepositions used, the more fluent the sentences are.