## Data preparation

In [1]:
import csv
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/Project')
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive/Colab Notebooks/Project'

In [2]:
import pandas as pd
import string
import re
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

In [3]:
def pre_processing(df: pd.DataFrame) -> pd.DataFrame:
    df['genre'] = df['genre'].replace('main-', '', regex=True)
    df['genre'] = df['genre'].replace('forum', 'forums')
    df['year'] = df['year'].replace(r'\D', '', regex=True)
    df['score'] = MinMaxScaler().fit_transform(df[['score']])
    return df

In [4]:
path_train = 'data/sts-train.csv'
path_dev = 'data/sts-dev.csv'
path_test = 'data/sts-test.csv'

In [5]:
columns=['genre', 'file', 'year', 'index', 'score', 'sentence1', 'sentence2']

In [6]:
df_train = pd.read_csv(path_train, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')
df_dev = pd.read_csv(path_dev, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')
df_test = pd.read_csv(path_test, sep='\t', usecols=range(7), header=None, quoting=csv.QUOTE_NONE, names=columns, encoding='UTF-8')

In [7]:
df_train = pre_processing(df_train)
df_dev = pre_processing(df_dev)
df_test = pre_processing(df_test)

In [8]:
string.punctuation
regex = '[' + string.punctuation + ']'

In [9]:
print(len(df_train))
print(len(df_dev))
print(len(df_test))

5749
1500
1379


In [10]:
df = pd.concat([df_train, df_dev, df_test]).reset_index(drop=True)
df.head(10)

Unnamed: 0,genre,file,year,index,score,sentence1,sentence2
0,captions,MSRvid,2012,1,1.0,A plane is taking off.,An air plane is taking off.
1,captions,MSRvid,2012,4,0.76,A man is playing a large flute.,A man is playing a flute.
2,captions,MSRvid,2012,5,0.76,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,captions,MSRvid,2012,6,0.52,Three men are playing chess.,Two men are playing chess.
4,captions,MSRvid,2012,9,0.85,A man is playing the cello.,A man seated is playing the cello.
5,captions,MSRvid,2012,11,0.85,Some men are fighting.,Two men are fighting.
6,captions,MSRvid,2012,12,0.1,A man is smoking.,A man is skating.
7,captions,MSRvid,2012,13,0.32,The man is playing the piano.,The man is playing the guitar.
8,captions,MSRvid,2012,14,0.44,A man is playing on a guitar and singing.,A woman is playing an acoustic guitar and sing...
9,captions,MSRvid,2012,16,1.0,A person is throwing a cat on to the ceiling.,A person throws a cat on the ceiling.


## TF-IDF bag-of-words vectors

In [11]:
for i in range(0,len(df_train)):
  df_train['sentence1'][i] = df_train['sentence1'][i].lower()
  df_train['sentence2'][i] = df_train['sentence2'][i].lower()
  df_train['sentence1'][i] = re.sub(regex,' ',df_train['sentence1'][i])
  df_train['sentence2'][i] = re.sub(regex,' ',df_train['sentence2'][i])

for i in range(0,len(df_test)):
  df_test['sentence1'][i] = df_test['sentence1'][i].lower()
  df_test['sentence2'][i] = df_test['sentence2'][i].lower()
  df_test['sentence1'][i] = re.sub(regex,' ',df_test['sentence1'][i])
  df_test['sentence2'][i] = re.sub(regex,' ',df_test['sentence2'][i])

for i in range(0,len(df_dev)):
  df_dev['sentence1'][i] = df_dev['sentence1'][i].lower()
  df_dev['sentence2'][i] = df_dev['sentence2'][i].lower()
  df_dev['sentence1'][i] = re.sub(regex,' ',df_dev['sentence1'][i])
  df_dev['sentence2'][i] = re.sub(regex,' ',df_dev['sentence2'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['sentence1'][i] = df_train['sentence1'][i].lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['sentence2'][i] = df_train['sentence2'][i].lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['sentence1'][i] = re.sub(regex,' ',df_train['sentence1'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a

In [12]:
dict = df.to_dict('records')
dict[0]

{'genre': 'captions',
 'file': 'MSRvid',
 'year': '2012',
 'index': 1,
 'score': 1.0,
 'sentence1': 'A plane is taking off.',
 'sentence2': 'An air plane is taking off.'}

In [13]:
document = [f"{sample['sentence1']} \n\n {sample['sentence2']}" for sample in dict]
document[0]

'A plane is taking off. \n\n An air plane is taking off.'

In [14]:
document1 = [f"{sample['sentence1']}" for sample in dict]
document1[0]

'A plane is taking off.'

In [15]:
document2 = [f"{sample['sentence2']}" for sample in dict]
document2[0]

'An air plane is taking off.'

In [16]:
scores = [sample['score'] for sample in dict]
len(scores)

8628

#Create vectorizer with unigrams

In [17]:
vectorizer = TfidfVectorizer(analyzer= 'word', max_df=0.8, min_df=4,stop_words="english")
vectorizer.fit(document)

In [18]:
vocab = vectorizer.get_feature_names_out()
print(f"Length of vocabulary: {len(vocab)}")

Length of vocabulary: 3816


In [19]:
vector_document1 = vectorizer.transform(document1)
vector_document1

<8628x3816 sparse matrix of type '<class 'numpy.float64'>'
	with 42181 stored elements in Compressed Sparse Row format>

In [20]:
sorted([(vocab[j], vector_document1[0, j]) for j in vector_document1[0].nonzero()[1]], key=lambda x: -x[1])

[('taking', 0.7190478777658711), ('plane', 0.6949605380742111)]

In [21]:
vector_document2 = vectorizer.transform(document2)
vector_document2

<8628x3816 sparse matrix of type '<class 'numpy.float64'>'
	with 42087 stored elements in Compressed Sparse Row format>

In [22]:
sorted([(vocab[j], vector_document2[0, j]) for j in vector_document2[0].nonzero()[1]], key=lambda x: -x[1])

[('taking', 0.5992134496233458),
 ('plane', 0.5791404359128616),
 ('air', 0.552756363401771)]

Compute similarity

In [23]:
vec = pd.DataFrame(columns=['Similarity','Normalized Score'])

for i in range(0,len(df)):
  new_row = {'Similarity': vector_document1[i].multiply(vector_document2[i]).sum(), 'Normalized Score': df['score'][i]}
  vec.loc[len(vec)] = new_row


#scaler = MinMaxScaler()
#vec['Normalized Score'] = scaler.fit_transform(vec[['Normalized Score']])

In [24]:
vec.head(10)

Unnamed: 0,Similarity,Normalized Score
0,0.833343,1.0
1,0.825563,0.76
2,0.854661,0.76
3,1.0,0.52
4,0.534796,0.85
5,1.0,0.85
6,0.134325,0.1
7,0.45136,0.32
8,0.659117,0.44
9,0.625417,1.0


# Extending the BOW vector to include n-grams

Include bigrams

In [25]:
vectorizer_bi = TfidfVectorizer(analyzer= 'word', max_df=0.8, min_df=4, stop_words="english", ngram_range=(1,2))
vectorizer_bi.fit(document)

In [26]:
vocab_bi = vectorizer_bi.get_feature_names_out()
print(f"Length of vocabulary: {len(vocab_bi)}")

Length of vocabulary: 5436


In [27]:
vector_document1_bi = vectorizer_bi.transform(document1)

In [28]:
vector_document2_bi = vectorizer_bi.transform(document2)

In [29]:
vec_bi = pd.DataFrame(columns=['Similarity','Normalized Score'])

for i in range(0,len(df)):
  new_row = {'Similarity': vector_document1_bi[i].multiply(vector_document2_bi[i]).sum(), 'Normalized Score': df['score'][i]}
  vec_bi.loc[len(vec_bi)] = new_row

#vec_bi['Normalized Score'] = scaler.fit_transform(vec_bi[['Normalized Score']])

In [30]:
vec_bi.head(10)

Unnamed: 0,Similarity,Normalized Score
0,0.833343,1.0
1,0.715372,0.76
2,0.806057,0.76
3,1.0,0.52
4,0.285876,0.85
5,1.0,0.85
6,0.093948,0.1
7,0.436423,0.32
8,0.537395,0.44
9,0.625417,1.0


Include trigrams

In [31]:
vectorizer_tri = TfidfVectorizer(analyzer= 'word', max_df=0.8, min_df=4, stop_words="english", ngram_range=(1,3))
vectorizer_tri.fit(document)

In [32]:
vocab_tri = vectorizer_tri.get_feature_names_out()
print(f"Length of vocabulary: {len(vocab_tri)}")

Length of vocabulary: 5976


In [33]:
vector_document1_tri = vectorizer_tri.transform(document1)

In [34]:
vector_document2_tri = vectorizer_tri.transform(document2)

In [35]:
vec_tri = pd.DataFrame(columns=['Similarity','Normalized Score'])

for i in range(0,len(df)):
  new_row = {'Similarity': vector_document1_tri[i].multiply(vector_document2_tri[i]).sum(), 'Normalized Score': df['score'][i]}
  vec_tri.loc[len(vec_tri)] = new_row

#vec_tri['Normalized Score'] = scaler.fit_transform(vec_tri[['Normalized Score']])

In [36]:
vec_tri.head(10)

Unnamed: 0,Similarity,Normalized Score
0,0.833343,1.0
1,0.615618,0.76
2,0.806057,0.76
3,1.0,0.52
4,0.285876,0.85
5,1.0,0.85
6,0.093948,0.1
7,0.326472,0.32
8,0.44326,0.44
9,0.625417,1.0


#Evaluation

Unigrams

In [37]:
print("MSE: ", metrics.mean_squared_error(vec['Normalized Score'], vec['Similarity']))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(vec['Normalized Score'], vec['Similarity'])))
print("R2: ", metrics.r2_score(vec['Normalized Score'], vec['Similarity']))

MSE:  0.06223289804452963
RMSE:  0.24946522411857255
R2:  0.29512501936349267


Bigrams

In [38]:
print("MSE: ", metrics.mean_squared_error(vec_bi['Normalized Score'], vec_bi['Similarity']))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(vec_bi['Normalized Score'], vec_bi['Similarity'])))
print("R2: ", metrics.r2_score(vec_bi['Normalized Score'], vec_bi['Similarity']))

MSE:  0.07110730941821312
RMSE:  0.26665953839721
R2:  0.19460984585012597


Trigrams

In [39]:
print("MSE: ", metrics.mean_squared_error(vec_tri['Normalized Score'], vec_tri['Similarity']))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(vec_tri['Normalized Score'], vec_tri['Similarity'])))
print("R2: ", metrics.r2_score(vec_tri['Normalized Score'], vec_tri['Similarity']))

MSE:  0.07350220828581208
RMSE:  0.27111290689639267
R2:  0.16748425237836906


# Regression task - sum of tf-idf matrices

Creation of datasets

In [40]:
dict_train = df_train.to_dict('records')
document_train1 = [f"{sample['sentence1']}" for sample in dict_train]
document_train2 = [f"{sample['sentence2']}" for sample in dict_train]

vector_train1 = vectorizer.transform(document_train1)
vector_train2 = vectorizer.transform(document_train2)

X_train = vector_train1 + vector_train2
y_train = [sample['score'] for sample in dict_train]

In [41]:
X_train

<5749x3816 sparse matrix of type '<class 'numpy.float64'>'
	with 41823 stored elements in Compressed Sparse Row format>

In [42]:
dict_dev = df_dev.to_dict('records')
document_dev1 = [f"{sample['sentence1']}" for sample in dict_dev]
document_dev2 = [f"{sample['sentence2']}" for sample in dict_dev]

vector_dev1 = vectorizer.transform(document_dev1)
vector_dev2 = vectorizer.transform(document_dev2)

X_dev = vector_dev1 + vector_dev2
y_dev = [sample['score'] for sample in dict_dev]

In [43]:
dict_test = df_test.to_dict('records')
document_test1 = [f"{sample['sentence1']}" for sample in dict_test]
document_test2 = [f"{sample['sentence2']}" for sample in dict_test]

vector_test1 = vectorizer.transform(document_test1)
vector_test2 = vectorizer.transform(document_test2)

X_test = vector_test1 + vector_test2
y_test = [sample['score'] for sample in dict_test]

# Linear Regression



In [44]:
from sklearn.linear_model import LinearRegression
import numpy as np

regressor = LinearRegression(n_jobs=-1)
regressor.fit(X_train,y_train)

In [45]:
vec_LR_train = pd.DataFrame(columns=['Prediction','Score'])

y_pred_train = regressor.predict(X_train)

for i in range(0,len(df_train)):
  new_row = {'Prediction': y_pred_train[i], 'Score': y_train[i]}
  vec_LR_train.loc[len(vec_LR_train)] = new_row

vec_LR_train.head(20)

Unnamed: 0,Prediction,Score
0,0.387597,1.0
1,0.512982,0.76
2,0.732043,0.76
3,0.241938,0.52
4,0.486144,0.85
5,0.470404,0.85
6,0.160254,0.1
7,0.345205,0.32
8,0.45267,0.44
9,0.711063,1.0


In [46]:
vec_LR_test = pd.DataFrame(columns=['Prediction','Score'])

y_pred_test = regressor.predict(X_test)

for i in range(0,len(df_test)):
  new_row = {'Prediction': y_pred_test[i], 'Score': y_test[i]}
  vec_LR_test.loc[len(vec_LR_test)] = new_row

vec_LR_test.head(10)

Unnamed: 0,Prediction,Score
0,0.701119,0.5
1,0.69982,0.72
2,0.003238,1.0
3,0.395203,0.84
4,0.310484,0.3
5,0.678913,0.36
6,0.58341,0.7
7,0.543909,0.44
8,0.613115,0.44
9,0.284552,0.3428


In [47]:
print("MSE train: ",metrics.mean_squared_error(y_train, regressor.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, regressor.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, regressor.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, regressor.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, regressor.predict(X_test))))
print("r2: ",metrics.r2_score(y_test, regressor.predict(X_test)))

MSE train:  0.027322404388612994
RMSE train:  0.1652949012783304
r2:  0.6814217729458077
MSE test:  0.7166993647607123
RMSE test:  0.8465809853526787
r2:  -6.704736131926527


# SVR


In [48]:
from sklearn.svm import SVR

regressor = SVR()
parameters = {'C': [10],
             'epsilon': [0.01],
             'gamma':['auto'],
             'kernel': ['linear', 'poly','rbf'],
             'degree': [2,3,5]
             }

#DEFINE YOUR GRIDSEARCH 
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(regressor, parameters, cv=3, verbose = 0)

gs = gs.fit(X_train,y_train)

In [49]:
print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
params = gs.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

***GRIDSEARCH RESULTS***
Best score: -0.090068 using {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-6.579101 (2.828699) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098653 (0.042231) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.090068 (0.059054) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-6.579101 (2.828699) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.090068 (0.059054) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-6.579101 (2.828699) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.090068 (0.059054) with: {'C': 10, 'degree': 5, 'ep

In [50]:
model = SVR(C= 10, degree= 2, epsilon= 0.01, gamma= 'auto', kernel= 'rbf')
model.fit(X_train,y_train)

In [51]:
print("MSE train: ",metrics.mean_squared_error(y_train, model.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, model.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, model.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, model.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, model.predict(X_test))))
print("r2: ", metrics.r2_score(y_test, model.predict(X_test)))

MSE train:  0.0749094253382429
RMSE train:  0.27369586284458686
r2:  0.126558864495407
MSE test:  0.08988995465546315
RMSE test:  0.2998165349934242
r2:  0.033655650354289746


# Regression task - senntence concatenation before tf-idf

Creation of datasets

In [52]:
dict_train = df_train.to_dict('records')
document_train = [f"{sample['sentence1']} \n\n {sample['sentence2']}" for sample in dict_train]

X_train = vectorizer.transform(document_train)

y_train = [sample['score'] for sample in dict_train]

In [53]:
dict_test = df_test.to_dict('records')
document_test = [f"{sample['sentence1']} \n\n {sample['sentence2']}" for sample in dict_test]

X_test = vectorizer.transform(document_test)

y_test = [sample['score'] for sample in dict_test]

# Linear Regression



In [54]:
from sklearn.linear_model import LinearRegression
import numpy as np

regressor = LinearRegression(n_jobs=-1)
regressor.fit(X_train,y_train)

In [55]:
vec_LR_train = pd.DataFrame(columns=['Prediction','Score'])

y_pred_train = regressor.predict(X_train)

for i in range(0,len(df_train)):
  new_row = {'Prediction': y_pred_train[i], 'Score': y_train[i]}
  vec_LR_train.loc[len(vec_LR_train)] = new_row

vec_LR_train.head(10)

Unnamed: 0,Prediction,Score
0,0.516687,1.0
1,0.531828,0.76
2,0.688237,0.76
3,0.42032,0.52
4,0.644763,0.85
5,0.584369,0.85
6,0.143403,0.1
7,0.331666,0.32
8,0.454864,0.44
9,0.686846,1.0


In [56]:
vec_LR_test = pd.DataFrame(columns=['Prediction','Score'])

y_pred_test = regressor.predict(X_test)

for i in range(0,len(df_test)):
  new_row = {'Prediction': y_pred_test[i], 'Score': y_test[i]}
  vec_LR_test.loc[len(vec_LR_test)] = new_row

vec_LR_test.head(10)

Unnamed: 0,Prediction,Score
0,0.689851,0.5
1,0.673249,0.72
2,0.173208,1.0
3,0.465855,0.84
4,0.290945,0.3
5,0.579082,0.36
6,0.571683,0.7
7,0.546908,0.44
8,0.794837,0.44
9,0.250742,0.3428


In [57]:
print("MSE train: ",metrics.mean_squared_error(y_train, regressor.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, regressor.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, regressor.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, regressor.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, regressor.predict(X_test))))
print("r2: ",metrics.r2_score(y_test, regressor.predict(X_test)))

MSE train:  0.023631540616226153
RMSE train:  0.15372553664315552
r2:  0.7244571083497295
MSE test:  0.6292206151263018
RMSE test:  0.7932342750576917
r2:  -5.76431296954654


# SVR


In [58]:
from sklearn.svm import SVR

regressor = SVR()
parameters = {'C': [10],
             'epsilon': [0.01],
             'gamma':['auto'],
             'kernel': ['linear', 'poly','rbf'],
             'degree': [2,3,5]
             }

#DEFINE YOUR GRIDSEARCH 
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(regressor, parameters, cv=3, verbose = 0)

gs = gs.fit(X_train,y_train)

In [59]:
print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
params = gs.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

***GRIDSEARCH RESULTS***
Best score: -0.097479 using {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-3.225942 (1.555890) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098591 (0.042246) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.097479 (0.062046) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-3.225942 (1.555890) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.097479 (0.062046) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-3.225942 (1.555890) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.097479 (0.062046) with: {'C': 10, 'degree': 5, 'ep

In [60]:
model = SVR(C= 10, degree= 2, epsilon= 0.01, gamma= 'auto', kernel= 'rbf')
model.fit(X_train,y_train)

In [61]:
print("MSE train: ",metrics.mean_squared_error(y_train, model.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, model.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, model.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, model.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, model.predict(X_test))))
print("r2: ", metrics.r2_score(y_test, model.predict(X_test)))

MSE train:  0.07965688592102446
RMSE train:  0.28223551498885546
r2:  0.07120364926762324
MSE test:  0.09088573210479026
RMSE test:  0.3014726058944498
r2:  0.022950739940772458


# Regression task - concatenation of tf-idf matrices

Creation of datasets

In [62]:
from scipy.sparse import coo_matrix, hstack
dict_train = df_train.to_dict('records')
document_train1 = [f"{sample['sentence1']}" for sample in dict_train]
document_train2 = [f"{sample['sentence2']}" for sample in dict_train]

vector_train1 = vectorizer.transform(document_train1)
vector_train2 = vectorizer.transform(document_train2)

X_train = hstack([vector_train1,vector_train2]).toarray()

y_train = [sample['score'] for sample in dict_train]

In [63]:
dict_test = df_test.to_dict('records')
document_test1 = [f"{sample['sentence1']}" for sample in dict_test]
document_test2 = [f"{sample['sentence2']}" for sample in dict_test]

vector_test1 = vectorizer.transform(document_test1)
vector_test2 = vectorizer.transform(document_test2)

X_test = hstack([vector_test1,vector_test2]).toarray()

y_test = [sample['score'] for sample in dict_test]
X_test.shape

(1379, 7632)

# Linear Regression



In [64]:
from sklearn.linear_model import LinearRegression
import numpy as np

regressor = LinearRegression(n_jobs=-1)
regressor.fit(X_train,y_train)

In [65]:
vec_LR_train = pd.DataFrame(columns=['Prediction','Score'])

y_pred_train = regressor.predict(X_train)

for i in range(0,len(df_train)):
  new_row = {'Prediction': y_pred_train[i], 'Score': y_train[i]}
  vec_LR_train.loc[len(vec_LR_train)] = new_row

vec_LR_train.head(10)

Unnamed: 0,Prediction,Score
0,0.996337,1.0
1,0.581409,0.76
2,0.944558,0.76
3,0.423856,0.52
4,0.924186,0.85
5,0.700228,0.85
6,0.173373,0.1
7,0.304738,0.32
8,0.468237,0.44
9,0.952718,1.0


In [66]:
vec_LR_test = pd.DataFrame(columns=['Prediction','Score'])

y_pred_test = regressor.predict(X_test)

for i in range(0,len(df_test)):
  new_row = {'Prediction': y_pred_test[i], 'Score': y_test[i]}
  vec_LR_test.loc[len(vec_LR_test)] = new_row

vec_LR_test.head(10)

Unnamed: 0,Prediction,Score
0,-866430700000.0,0.5
1,1.535123,0.72
2,1907541000000.0,1.0
3,0.4778207,0.84
4,-3094559000000.0,0.3
5,-0.1303127,0.36
6,0.4960511,0.7
7,0.8805609,0.44
8,7078505000.0,0.44
9,0.3773023,0.3428


In [67]:
print("MSE train: ",metrics.mean_squared_error(y_train, regressor.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, regressor.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, regressor.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, regressor.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, regressor.predict(X_test))))
print("r2: ",metrics.r2_score(y_test, regressor.predict(X_test)))

MSE train:  0.004091879597116368
RMSE train:  0.06396780125278942
r2:  0.9522888348760462
MSE test:  1.8570961102845743e+25
RMSE test:  4309403799001.1733
r2:  -1.9964347960803613e+26


# SVR


In [68]:
from sklearn.svm import SVR

regressor = SVR()
parameters = {'C': [10],
             'epsilon': [0.01],
             'gamma':['auto'],
             'kernel': ['linear', 'poly','rbf'],
             'degree': [2,3,5]
             }

#DEFINE YOUR GRIDSEARCH 
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(regressor, parameters, cv=3, verbose = 0)

gs = gs.fit(X_train,y_train)

In [69]:
print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
params = gs.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

#Returns the coefficient of determination R^2 of the prediction.
#Explained variance score: 1 is perfect prediction
gs.score(X_test, y_test)

***GRIDSEARCH RESULTS***
Best score: -0.098583 using {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-1.828273 (1.044901) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098586 (0.042247) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.099584 (0.052758) with: {'C': 10, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-1.828273 (1.044901) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.099584 (0.052758) with: {'C': 10, 'degree': 3, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
-1.828273 (1.044901) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
-0.098583 (0.042246) with: {'C': 10, 'degree': 5, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
-0.099584 (0.052758) with: {'C': 10, 'degree': 5, 'e

-0.05032047428968056

In [71]:
print("MSE train: ",metrics.mean_squared_error(y_train, model.predict(X_train)))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train, model.predict(X_train))))
print("r2: ",metrics.r2_score(y_train, model.predict(X_train)))

print("MSE test: ",metrics.mean_squared_error(y_test, model.predict(X_test)))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test, model.predict(X_test))))
print("r2: ", metrics.r2_score(y_test, model.predict(X_test)))

ValueError: ignored