## Data preparation

In [1]:
import csv
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/Project')
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive/Colab Notebooks/Project'

In [2]:
import pandas as pd
import string
import re
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

In [3]:
def pre_processing(df: pd.DataFrame) -> pd.DataFrame:
    df['genre'] = df['genre'].replace('main-', '', regex=True)
    df['genre'] = df['genre'].replace('forum', 'forums')
    df['year'] = df['year'].replace(r'\D', '', regex=True)
    df['score'] = MinMaxScaler().fit_transform(df[['score']])
    return df

In [4]:
path_train = 'data/sts-train.csv'
path_dev = 'data/sts-dev.csv'
path_test = 'data/sts-test.csv'

In [5]:
columns=['genre', 'file', 'year', 'index', 'score', 'sentence1', 'sentence2']

In [6]:
df_train = pd.read_csv(path_train, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')
df_dev = pd.read_csv(path_dev, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')
df_test = pd.read_csv(path_test, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')

In [7]:
string.punctuation

regex = '[' + string.punctuation + ']'

In [8]:
df_train = pre_processing(df_train)
df_dev = pre_processing(df_dev)
df_test = pre_processing(df_test)

In [9]:
print(len(df_train))
print(len(df_dev))
print(len(df_test))

5749
1500
1379


In [10]:
df = pd.concat([df_train, df_dev, df_test]).reset_index(drop=True)
df.head(10)

Unnamed: 0,genre,file,year,index,score,sentence1,sentence2
0,captions,MSRvid,2012,1,1.0,A plane is taking off.,An air plane is taking off.
1,captions,MSRvid,2012,4,0.76,A man is playing a large flute.,A man is playing a flute.
2,captions,MSRvid,2012,5,0.76,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,captions,MSRvid,2012,6,0.52,Three men are playing chess.,Two men are playing chess.
4,captions,MSRvid,2012,9,0.85,A man is playing the cello.,A man seated is playing the cello.
5,captions,MSRvid,2012,11,0.85,Some men are fighting.,Two men are fighting.
6,captions,MSRvid,2012,12,0.1,A man is smoking.,A man is skating.
7,captions,MSRvid,2012,13,0.32,The man is playing the piano.,The man is playing the guitar.
8,captions,MSRvid,2012,14,0.44,A man is playing on a guitar and singing.,A woman is playing an acoustic guitar and sing...
9,captions,MSRvid,2012,16,1.0,A person is throwing a cat on to the ceiling.,A person throws a cat on the ceiling.


## TF-IDF bag-of-words vectors

In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
from nltk.corpus import stopwords
for i in range(0,len(df_train)):
  df_train['sentence1'][i] = re.sub(regex,' ',df_train['sentence1'][i])
  df_train['sentence2'][i] = re.sub(regex,' ',df_train['sentence2'][i])
  words1 = df_train['sentence1'][i].lower().split()
  words2 = df_train['sentence2'][i].lower().split()
  df_train['sentence1'][i] = ' '.join([w for w in words1 if w not in stopwords.words('english')])
  df_train['sentence2'][i] = ' '.join([w for w in words2 if w not in stopwords.words('english')])

for i in range(0,len(df_test)):
  df_test['sentence1'][i] = re.sub(regex,' ',df_test['sentence1'][i])
  df_test['sentence2'][i] = re.sub(regex,' ',df_test['sentence2'][i])
  words1 = df_test['sentence1'][i].lower().split()
  words2 = df_test['sentence2'][i].lower().split()
  df_test['sentence1'][i] = ' '.join([w for w in words1 if w not in stopwords.words('english')])
  df_test['sentence2'][i] = ' '.join([w for w in words2 if w not in stopwords.words('english')])


for i in range(0,len(df_dev)):
  df_dev['sentence1'][i] = re.sub(regex,' ',df_dev['sentence1'][i])
  df_dev['sentence2'][i] = re.sub(regex,' ',df_dev['sentence2'][i])
  words1 = df_dev['sentence1'][i].lower().split()
  words2 = df_dev['sentence2'][i].lower().split()
  df_dev['sentence1'][i] = ' '.join([w for w in words1 if w not in stopwords.words('english')])
  df_dev['sentence2'][i] = ' '.join([w for w in words2 if w not in stopwords.words('english')])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['sentence1'][i] = re.sub(regex,' ',df_train['sentence1'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['sentence2'][i] = re.sub(regex,' ',df_train['sentence2'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['sentence1'][i] = ' '.join([w for w in words1 if w not in stopwords.words('english')])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-do

In [13]:
dict = df.to_dict('records')
dict[0]

{'genre': 'captions',
 'file': 'MSRvid',
 'year': '2012',
 'index': 1,
 'score': 1.0,
 'sentence1': 'A plane is taking off.',
 'sentence2': 'An air plane is taking off.'}

In [14]:
document = [f"{sample['sentence1']} \n\n {sample['sentence2']}" for sample in dict]
document[0]

'A plane is taking off. \n\n An air plane is taking off.'

In [15]:
document1 = [f"{sample['sentence1']}" for sample in dict]
document1[0]

'A plane is taking off.'

In [16]:
document2 = [f"{sample['sentence2']}" for sample in dict]
document2[0]

'An air plane is taking off.'

In [17]:
scores = [sample['score'] for sample in dict]
len(scores)

8628

POS TAGGING

In [18]:
import spacy
import en_core_web_sm
nlp_model = en_core_web_sm.load()

In [19]:
doc_pos=[]
string= ""

for i in range(0,len(df)):
  parsed_text = nlp_model(document[i])
  doc_pos.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))



In [20]:
doc_pos1=[]
string= ""

for i in range(0,len(df)):
  parsed_text = nlp_model(document1[i])
  doc_pos1.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))


In [21]:
doc_pos2=[]
string= ""

for i in range(0,len(df)):
  parsed_text = nlp_model(document2[i])
  doc_pos2.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))


#Create vectorizer with unigrams

In [22]:
pattern = r'\([A-Za-z0-9]+,\s\'[A-Z]+\'\)'
pattern

"\\([A-Za-z0-9]+,\\s\\'[A-Z]+\\'\\)"

In [23]:
vectorizer = TfidfVectorizer(analyzer= 'word', token_pattern=pattern, max_df=0.8, min_df=4, lowercase=False)
vectorizer.fit(doc_pos)

In [24]:
vocab = vectorizer.get_feature_names_out()
print(f"Length of vocabulary: {len(vocab)}")

Length of vocabulary: 4473


In [25]:
vector_document1 = vectorizer.transform(doc_pos1)
vector_document1

<8628x4473 sparse matrix of type '<class 'numpy.float64'>'
	with 71777 stored elements in Compressed Sparse Row format>

In [26]:
sorted([(vocab[j], vector_document1[0, j]) for j in vector_document1[0].nonzero()[1]], key=lambda x: -x[1])

[("(taking, 'VERB')", 0.5777384565895795),
 ("(plane, 'NOUN')", 0.5628656524411488),
 ("(off, 'ADP')", 0.5129086521156572),
 ("(is, 'AUX')", 0.20964536098607556),
 ("(A, 'DET')", 0.20584963026784428)]

In [27]:
vector_document2 = vectorizer.transform(doc_pos2)
vector_document2

<8628x4473 sparse matrix of type '<class 'numpy.float64'>'
	with 71576 stored elements in Compressed Sparse Row format>

In [28]:
sorted([(vocab[j], vector_document2[0, j]) for j in vector_document2[0].nonzero()[1]], key=lambda x: -x[1])

[("(taking, 'VERB')", 0.46105413361927433),
 ("(plane, 'NOUN')", 0.4491851507725688),
 ("(air, 'NOUN')", 0.4433159665885308),
 ("(An, 'DET')", 0.4400012229988362),
 ("(off, 'ADP')", 0.4093178349645617),
 ("(is, 'AUX')", 0.16730383649257402)]

Compute similarity

In [29]:
vec = pd.DataFrame(columns=['Similarity','Normalized Score'])

for i in range(0,len(df)):
  new_row = {'Similarity': vector_document1[i].multiply(vector_document2[i]).sum(), 'Normalized Score': df['score'][i]}
  vec.loc[len(vec)] = new_row


#scaler = MinMaxScaler()
#vec['Normalized Score'] = scaler.fit_transform(vec[['Normalized Score']])

In [30]:
vec.head(10)

Unnamed: 0,Similarity,Normalized Score
0,0.764217,1.0
1,0.845159,0.76
2,0.956545,0.76
3,0.70887,0.52
4,0.60845,0.85
5,0.745895,0.85
6,0.226573,0.1
7,0.586443,0.32
8,0.673334,0.44
9,0.639271,1.0


# Extending the BOW vector to include n-grams

Include bigrams

In [31]:
vectorizer_bi = TfidfVectorizer(analyzer= 'word', max_df=0.8, min_df=4, stop_words="english", ngram_range=(1,2))
vectorizer_bi.fit(document)

In [32]:
vocab_bi = vectorizer_bi.get_feature_names_out()
print(f"Length of vocabulary: {len(vocab_bi)}")

Length of vocabulary: 5436


In [33]:
vector_document1_bi = vectorizer_bi.transform(document1)

In [34]:
vector_document2_bi = vectorizer_bi.transform(document2)

In [35]:
vec_bi = pd.DataFrame(columns=['Similarity','Normalized Score'])

for i in range(0,len(df)):
  new_row = {'Similarity': vector_document1_bi[i].multiply(vector_document2_bi[i]).sum(), 'Normalized Score': df['score'][i]}
  vec_bi.loc[len(vec_bi)] = new_row

#vec_bi['Normalized Score'] = scaler.fit_transform(vec_bi[['Normalized Score']])

In [36]:
vec_bi.head(10)

Unnamed: 0,Similarity,Normalized Score
0,0.833343,1.0
1,0.715372,0.76
2,0.806057,0.76
3,1.0,0.52
4,0.285876,0.85
5,1.0,0.85
6,0.093948,0.1
7,0.436423,0.32
8,0.537395,0.44
9,0.625417,1.0


Include trigrams

In [37]:
vectorizer_tri = TfidfVectorizer(analyzer= 'word', max_df=0.8, min_df=4, stop_words="english", ngram_range=(1,3))
vectorizer_tri.fit(document)

In [38]:
vocab_tri = vectorizer_tri.get_feature_names_out()
print(f"Length of vocabulary: {len(vocab_tri)}")

Length of vocabulary: 5976


In [39]:
vector_document1_tri = vectorizer_tri.transform(document1)

In [40]:
vector_document2_tri = vectorizer_tri.transform(document2)

In [41]:
vec_tri = pd.DataFrame(columns=['Similarity','Normalized Score'])

for i in range(0,len(df)):
  new_row = {'Similarity': vector_document1_tri[i].multiply(vector_document2_tri[i]).sum(), 'Normalized Score': df['score'][i]}
  vec_tri.loc[len(vec_tri)] = new_row

#vec_tri['Normalized Score'] = scaler.fit_transform(vec_tri[['Normalized Score']])

In [42]:
vec_tri.head(10)

Unnamed: 0,Similarity,Normalized Score
0,0.833343,1.0
1,0.615618,0.76
2,0.806057,0.76
3,1.0,0.52
4,0.285876,0.85
5,1.0,0.85
6,0.093948,0.1
7,0.326472,0.32
8,0.44326,0.44
9,0.625417,1.0


#Evaluation

Unigrams

In [43]:
print("MSE: ", metrics.mean_squared_error(vec['Normalized Score'], vec['Similarity']))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(vec['Normalized Score'], vec['Similarity'])))
print("R2: ", metrics.r2_score(vec['Normalized Score'], vec['Similarity']))

MSE:  0.07126283456125121
RMSE:  0.2669509965541451
R2:  0.19284830515971796


Bigrams

In [44]:
print("MSE: ", metrics.mean_squared_error(vec_bi['Normalized Score'], vec_bi['Similarity']))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(vec_bi['Normalized Score'], vec_bi['Similarity'])))
print("R2: ", metrics.r2_score(vec_bi['Normalized Score'], vec_bi['Similarity']))

MSE:  0.07110730941821312
RMSE:  0.26665953839721
R2:  0.19460984585012597


Trigrams

In [45]:
print("MSE: ", metrics.mean_squared_error(vec_tri['Normalized Score'], vec_tri['Similarity']))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(vec_tri['Normalized Score'], vec_tri['Similarity'])))
print("R2: ", metrics.r2_score(vec_tri['Normalized Score'], vec_tri['Similarity']))

MSE:  0.07350220828581208
RMSE:  0.27111290689639267
R2:  0.16748425237836906


# Regression Task

Creation of datasets

In [46]:
dict_train = df_train.to_dict('records')
document_train1 = [f"{sample['sentence1']}" for sample in dict_train]
document_train2 = [f"{sample['sentence2']}" for sample in dict_train]

doc1_train_pos=[]
doc2_train_pos=[]

for i in range(0,len(df_train)):
  parsed_text = nlp_model(document_train1[i])
  doc1_train_pos.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))

for i in range(0,len(df_train)):
  parsed_text = nlp_model(document_train2[i])
  doc2_train_pos.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))


vector_train1 = vectorizer.transform(doc1_train_pos)
vector_train2 = vectorizer.transform(doc2_train_pos)


X_train = vector_train1 + vector_train2
y_train = [sample['score'] for sample in dict_train]

In [47]:
dict_test = df_test.to_dict('records')
document_test1 = [f"{sample['sentence1']}" for sample in dict_test]
document_test2 = [f"{sample['sentence2']}" for sample in dict_test]

doc1_test_pos=[]
doc2_test_pos=[]

for i in range(0,len(df_test)):
  parsed_text = nlp_model(document_test1[i])
  doc1_test_pos.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))

for i in range(0,len(df_test)):
  parsed_text = nlp_model(document_test2[i])
  doc2_test_pos.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))

vector_test1 = vectorizer.transform(doc1_test_pos)
vector_test2 = vectorizer.transform(doc2_test_pos)

X_test = vector_test1 + vector_test2
y_test = [sample['score'] for sample in dict_test]


## Linear regression

In [48]:
from sklearn.linear_model import LinearRegression
import numpy as np

regressor = LinearRegression()
regressor.fit(X_train,y_train)

In [49]:
vec_LR_train = pd.DataFrame(columns=['Prediction','Score'])

y_pred_train = regressor.predict(X_train)

for i in range(0,len(df_train)):
  new_row = {'Prediction': y_pred_train[i], 'Score': y_train[i]}
  vec_LR_train.loc[len(vec_LR_train)] = new_row

vec_LR_train.head(10)

Unnamed: 0,Prediction,Score
0,0.362003,1.0
1,0.501163,0.76
2,0.761718,0.76
3,0.313373,0.52
4,0.603009,0.85
5,0.488274,0.85
6,0.140994,0.1
7,0.334277,0.32
8,0.421191,0.44
9,0.684734,1.0


In [50]:
vec_LR_test = pd.DataFrame(columns=['Prediction','Score'])

y_pred_test = regressor.predict(X_test)

for i in range(0,len(df_test)):
  new_row = {'Prediction': y_pred_test[i], 'Score': y_test[i]}
  vec_LR_test.loc[len(vec_LR_test)] = new_row

vec_LR_test.head(10)

Unnamed: 0,Prediction,Score
0,0.25379,0.5
1,0.640351,0.72
2,0.318791,1.0
3,0.355568,0.84
4,0.459106,0.3
5,0.713639,0.36
6,0.560068,0.7
7,0.510905,0.44
8,0.38136,0.44
9,0.276707,0.3428


In [51]:
print("MSE train: ",metrics.mean_squared_error(y_train, regressor.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, regressor.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, regressor.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, regressor.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, regressor.predict(X_test))))
print("r2: ",metrics.r2_score(y_test, regressor.predict(X_test)))

MSE train:  0.03454022677371789
RMSE train:  0.18585001149776098
r2:  0.5972622302520797
MSE test:  0.34017816059866196
RMSE test:  0.5832479409296375
r2:  -2.657018680534056


# SVR


In [52]:
from sklearn.svm import SVR

regressor = SVR()
parameters = {'C': [10],
             'epsilon': [0.01],
             'gamma':['auto'],
             'kernel': ['linear', 'poly','rbf'],
             'degree': [2,3,5]
             }

#DEFINE YOUR GRIDSEARCH 
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(regressor, parameters, cv=3, verbose = 0)

gs = gs.fit(X_train,y_train)

In [53]:
print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
params = gs.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


***GRIDSEARCH RESULTS***
Best score: -0.093122 using {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-6.540471 (2.092318) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098618 (0.042239) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.093122 (0.059259) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-6.540471 (2.092318) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.093122 (0.059259) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-6.540471 (2.092318) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.093122 (0.059259) with: {'C': 10, 'degree': 5, 'ep

In [54]:
model = SVR(C= 10, degree= 5, epsilon= 0.01, gamma= 'auto', kernel= 'poly')
model.fit(X_train,y_train)

In [55]:
print("MSE train: ",metrics.mean_squared_error(y_train, model.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, model.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, model.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, model.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, model.predict(X_test))))
print("r2: ", metrics.r2_score(y_test, model.predict(X_test)))

MSE train:  0.08824362411676642
RMSE train:  0.2970582840399615
r2:  -0.028917401269132004
MSE test:  0.09770146619289292
RMSE test:  0.31257233753627806
r2:  -0.05032047428967523


# Regression Task - sentence concatenation before tf-idf

Creation of datasets

In [56]:
dict_train = df_train.to_dict('records')
document_train = [f"{sample['sentence1']} \n\n {sample['sentence2']}" for sample in dict_train]

doc_train_pos=[]

for i in range(0,len(df_train)):
  parsed_text = nlp_model(document_train[i])
  doc_train_pos.append(''.join([f"{(w,w.pos_)}" for w in parsed_text]))

X_train = vectorizer.transform(doc_train_pos)

y_train = [sample['score'] for sample in dict_train]

In [57]:
dict_test = df_test.to_dict('records')
document_test = [f"{sample['sentence1']} \n\n {sample['sentence2']}" for sample in dict_test]

doc_test_pos=[]

for i in range(0,len(df_test)):
  parsed_text = nlp_model(document_test[i])
  doc_test_pos.append(''.join([f"{(w,w.pos_)}" for w in parsed_text]))

X_test = vectorizer.transform(doc_test_pos)

y_test = [sample['score'] for sample in dict_test]

## Linear regression

In [58]:
from sklearn.linear_model import LinearRegression
import numpy as np

regressor = LinearRegression()
regressor.fit(X_train,y_train)

In [59]:
vec_LR_train = pd.DataFrame(columns=['Prediction','Score'])

y_pred_train = regressor.predict(X_train)

for i in range(0,len(df_train)):
  new_row = {'Prediction': y_pred_train[i], 'Score': y_train[i]}
  vec_LR_train.loc[len(vec_LR_train)] = new_row

vec_LR_train.head(10)

Unnamed: 0,Prediction,Score
0,0.501197,1.0
1,0.556196,0.76
2,0.82934,0.76
3,0.300427,0.52
4,0.746979,0.85
5,0.534527,0.85
6,0.121205,0.1
7,0.32446,0.32
8,0.421226,0.44
9,0.658744,1.0


In [60]:
vec_LR_test = pd.DataFrame(columns=['Prediction','Score'])

y_pred_test = regressor.predict(X_test)

for i in range(0,len(df_test)):
  new_row = {'Prediction': y_pred_test[i], 'Score': y_test[i]}
  vec_LR_test.loc[len(vec_LR_test)] = new_row

vec_LR_test.head(10)

Unnamed: 0,Prediction,Score
0,0.317659,0.5
1,0.61714,0.72
2,0.588539,1.0
3,0.375985,0.84
4,0.471612,0.3
5,0.756714,0.36
6,0.598255,0.7
7,0.528332,0.44
8,0.740595,0.44
9,0.263935,0.3428


In [61]:
print("MSE train: ",metrics.mean_squared_error(y_train, regressor.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, regressor.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, regressor.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, regressor.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, regressor.predict(X_test))))
print("r2: ",metrics.r2_score(y_test, regressor.predict(X_test)))

MSE train:  0.031994141831913035
RMSE train:  0.1788690633729406
r2:  0.6269494867304156
MSE test:  0.31531660369146747
RMSE test:  0.5615305901653689
r2:  -2.389749382949617


# SVR


In [62]:
from sklearn.svm import SVR

regressor = SVR()
parameters = {'C': [10],
             'epsilon': [0.01],
             'gamma':['auto'],
             'kernel': ['linear', 'poly','rbf'],
             'degree': [2,3,5]
             }

#DEFINE YOUR GRIDSEARCH 
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(regressor, parameters, cv=3, verbose = 0)

gs = gs.fit(X_train,y_train)

In [63]:
print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
params = gs.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


***GRIDSEARCH RESULTS***
Best score: -0.098583 using {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-3.161788 (1.264478) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098588 (0.042247) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.099642 (0.061060) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-3.161788 (1.264478) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.099642 (0.061060) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-3.161788 (1.264478) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.099642 (0.061060) with: {'C': 10, 'degree': 5, 'e

In [64]:
model = SVR(C= 10, degree= 5, epsilon= 0.01, gamma= 'auto', kernel= 'poly')
model.fit(X_train,y_train)

In [65]:
print("MSE train: ",metrics.mean_squared_error(y_train, model.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, model.predict(X_train))))
print("r2: ", metrics.r2_score(y_train, model.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, model.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, model.predict(X_test))))
print("r2: ", metrics.r2_score(y_test, model.predict(X_test)))

MSE train:  0.08824362411676753
RMSE train:  0.2970582840399633
r2:  -0.028917401269145104
MSE test:  0.09770146619289341
RMSE test:  0.31257233753627883
r2:  -0.05032047428968056


# Regression Task - concatenation of tf-idf matrices

Creation of datasets

In [66]:
from scipy.sparse import coo_matrix, hstack
dict_train = df_train.to_dict('records')
document_train1 = [f"{sample['sentence1']}" for sample in dict_train]
document_train2 = [f"{sample['sentence2']}" for sample in dict_train]

doc1_train_pos=[]
doc2_train_pos=[]

for i in range(0,len(df_train)):
  parsed_text = nlp_model(document_train1[i])
  doc1_train_pos.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))

for i in range(0,len(df_train)):
  parsed_text = nlp_model(document_train2[i])
  doc2_train_pos.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))


vector_train1 = vectorizer.transform(doc1_train_pos)
vector_train2 = vectorizer.transform(doc2_train_pos)



X_train = hstack([vector_train1,vector_train2]).toarray()
y_train = [sample['score'] for sample in dict_train]

In [67]:
dict_test = df_test.to_dict('records')
document_test1 = [f"{sample['sentence1']}" for sample in dict_test]
document_test2 = [f"{sample['sentence2']}" for sample in dict_test]

doc1_test_pos=[]
doc2_test_pos=[]

for i in range(0,len(df_test)):
  parsed_text = nlp_model(document_test1[i])
  doc1_test_pos.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))

for i in range(0,len(df_test)):
  parsed_text = nlp_model(document_test2[i])
  doc2_test_pos.append(' '.join([f"{(w,w.pos_)}" for w in parsed_text]))

vector_test1 = vectorizer.transform(doc1_test_pos)
vector_test2 = vectorizer.transform(doc2_test_pos)

X_test = hstack([vector_test1,vector_test2]).toarray()

y_test = [sample['score'] for sample in dict_test]


## Linear regression

In [68]:
from sklearn.linear_model import LinearRegression
import numpy as np

regressor = LinearRegression()
regressor.fit(X_train,y_train)

In [69]:
vec_LR_train = pd.DataFrame(columns=['Prediction','Score'])

y_pred_train = regressor.predict(X_train)

for i in range(0,len(df_train)):
  new_row = {'Prediction': y_pred_train[i], 'Score': y_train[i]}
  vec_LR_train.loc[len(vec_LR_train)] = new_row

vec_LR_train.head(10)

Unnamed: 0,Prediction,Score
0,0.99099,1.0
1,0.56682,0.76
2,0.69654,0.76
3,0.281671,0.52
4,0.74653,0.85
5,0.639722,0.85
6,0.103506,0.1
7,0.31326,0.32
8,0.593538,0.44
9,0.881862,1.0


In [70]:
vec_LR_test = pd.DataFrame(columns=['Prediction','Score'])

y_pred_test = regressor.predict(X_test)

for i in range(0,len(df_test)):
  new_row = {'Prediction': y_pred_test[i], 'Score': y_test[i]}
  vec_LR_test.loc[len(vec_LR_test)] = new_row

vec_LR_test.head(10)

Unnamed: 0,Prediction,Score
0,-0.02824932,0.5
1,0.8215764,0.72
2,1.122591,1.0
3,0.4157194,0.84
4,-183207900000.0,0.3
5,0.03566215,0.36
6,-0.1899741,0.7
7,1.027983,0.44
8,0.4050656,0.44
9,0.3170498,0.3428


In [71]:
print("MSE train: ",metrics.mean_squared_error(y_train, regressor.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, regressor.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, regressor.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, regressor.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, regressor.predict(X_test))))
print("r2: ",metrics.r2_score(y_test, regressor.predict(X_test)))

MSE train:  0.007664004353870024
RMSE train:  0.0875442993796285
r2:  0.9106379920132839
MSE test:  7.313108311622442e+24
RMSE test:  2704275931117.6885
r2:  -7.861813839344198e+25


# SVR


In [72]:
from sklearn.svm import SVR

regressor = SVR()
parameters = {'C': [10],
             'epsilon': [0.01],
             'gamma':['auto'],
             'kernel': ['linear', 'poly','rbf'],
             'degree': [2,3,5]
             }

#DEFINE YOUR GRIDSEARCH 
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(regressor, parameters, cv=3, verbose = 0)

gs = gs.fit(X_train,y_train)

KeyboardInterrupt: ignored

In [None]:
print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
params = gs.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


In [None]:
print("MSE train: ",metrics.mean_squared_error(y_train, model.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, model.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, model.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, model.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, model.predict(X_test))))
print("r2: ", metrics.r2_score(y_test, model.predict(X_test)))