# **AML 4**

***Teammates:***

Karthik Rajaraman Iyer
kr2859@columbia.edu

Anjani Prasad Atluri
aa4462@columbia.edu

## Task 2

In [0]:
#imports
from google.colab import drive
import pandas as pd
import gc
from nltk.stem.snowball import EnglishStemmer
from sklearn.model_selection import cross_val_score 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import download
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize     
import numpy as np
from gensim import models
from sklearn.preprocessing import FunctionTransformer
import nltk     
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import GridSearchCV 
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import accuracy_score, r2_score
from sklearn.impute import SimpleImputer
import spacy
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [0]:
#Downloading the model
#Shell command (IPython `!` syntax): fetches the large English spaCy package
#"en_core_web_lg", which a later cell loads by that name via spacy.load.
!python -m spacy download en_core_web_lg

In [0]:
#loading the dataset

#mounting drive
drive.mount("/content/gdrive")

#data loading
df = pd.read_csv('/content/gdrive/My Drive/Columbia Photos/winemag-data-130k-v2.csv')

#subsampling the data (fixed random_state keeps the 50% sample reproducible)
df1=df.sample(frac=0.5, random_state=0)

#data splitting: 'points' is the regression target, every other column is a feature
y=pd.DataFrame(df1['points']).values
X= df1.loc[:, df1.columns != 'points']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

#getting the data into required format

#Append the title column data to the description column data and make a list.
#Missing values are replaced by '' first: str + NaN evaluates to NaN, which would
#put a float into the list and break the tokenizers used in later cells.
nX_train= list(X_train['description'].fillna('') +' '+ X_train['title'].fillna(''))
nX_test = list(X_test['description'].fillna('') +' '+ X_test['title'].fillna(''))


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## **Just the word embeddings model**

In [0]:
#Load the spaCy model; tagger, parser and NER are disabled because only the
#document vectors are used below.
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner"])

#nlp.pipe streams the texts through the pipeline in batches, which is faster than
#calling nlp(text) once per document and yields the same Doc objects in order.
docs_train = [doc.vector for doc in nlp.pipe(nX_train)]
X_train = np.vstack(docs_train)

docs_test = [doc.vector for doc in nlp.pipe(nX_test)]
X_test = np.vstack(docs_test)


In [0]:
#Ridge regression (alpha=1) fitted on the spaCy document vectors only
lr_w2v = Ridge(alpha=1)
lr_w2v.fit(X_train, y_train)

#score() reports R^2 on the held-out split
print("testing score for word embeddings model: ",lr_w2v.score(X_test, y_test))

testing score for word embeddings model:  0.5296116719208357


In [0]:
#deleting un-necessary data to save RAM
#The loaded spaCy pipeline is bound to `nlp`; the name `en_core_web_lg` is never
#defined in this notebook, so deleting it raised NameError and aborted the cell
#before gc.collect() and the placeholder assignments could run.
del df1, df, X, y, docs_train, docs_test, X_train, X_test, nlp
gc.collect()

#Rebind the names to tiny placeholders so they still exist for later cells
df1= pd.DataFrame()
df= pd.DataFrame()
X=pd.DataFrame()
docs_train=[]
X_train=pd.DataFrame()
X_test=pd.DataFrame()
docs_test=[]
y=np.ndarray(shape=(2,2), dtype=float)

## **Just the GLOVE model**

In [0]:
#Loading the Glove model
#Each line of the file is "<word> <v1> <v2> ...", parsed into `model`, a dict
#mapping word -> numpy float vector.

print("Loading Glove Model")
model = {}
#`with` closes the file even if a line fails to parse
with open("/content/gdrive/My Drive/glove.vocab.300d.txt",'r', encoding="utf8") as f:
    for line in f:
        splitLine = line.split()
        if not splitLine:
            #blank line: nothing to parse (indexing [0] would raise IndexError)
            continue
        word = splitLine[0]
        model[word] = np.array([float(val) for val in splitLine[1:]])
print("Done.",len(model)," words loaded!")

Loading Glove Model
Done. 439  words loaded!


In [0]:
#Writing the Glove avg function
def return_avg_glove(X, embeddings=None, dim=300):
  """Average the embedding vectors of the known words in each text.

  Parameters
  ----------
  X : iterable of str
      Texts to embed. Each text is split on whitespace; the characters
      , ) ( . ; are stripped from every token and the token is lower-cased
      before lookup.
  embeddings : mapping of str -> numpy array, optional
      Word-vector lookup table. Defaults to the module-level `model` dict.
  dim : int, optional
      Length of the vectors (300 by default).

  Returns
  -------
  list of numpy arrays
      One `dim`-length vector per input text. A text with no token in the
      lookup table gets the zero vector instead of the NaN vector that a
      0/0 division would produce.
  """
  if embeddings is None:
    embeddings = model
  #translation table that deletes the punctuation characters , ) ( . ;
  strip_punct = str.maketrans("", "", ",)(.;")
  R = list()
  for desc in X:
    l = 0
    s = np.zeros(dim)
    for word in desc.split():
      word = word.translate(strip_punct).lower()
      if word in embeddings:
        s += embeddings[word]
        l += 1
    #only divide when at least one word matched; otherwise keep the zero vector
    if l:
      s /= l
    R.append(s)
  return R


In [0]:
#Wrap the GloVe averaging function as a single-step pipeline and embed both splits
vect = Pipeline(steps=[("embedding", FunctionTransformer(return_avg_glove))])
X_train = vect.fit_transform(nX_train)
X_test = vect.transform(nX_test)

#Ordinary least squares on the averaged GloVe vectors; score() reports R^2
clf = LinearRegression()
clf.fit(X_train, y_train)
print("testing score for Glove Averaged Model: ",clf.score(X_test,y_test))

testing score for Glove Averaged Model:  0.4451769822978259


As we can see, neither the pre-trained spaCy embeddings nor the averaged GloVe vectors alone improved on the previous models. We will therefore try combining the GloVe features with the bag-of-words model.

### **Bag of words model + GLOVE model**

In [0]:
#Making a Stemming function from nltk reference: https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
stemmer = EnglishStemmer()
#analyzer built from a CountVectorizer with default settings; it turns a raw document into tokens
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    """Return a lazy iterator over the stemmed tokens of `doc`."""
    return map(stemmer.stem, analyzer(doc))

#Encodings of the bag of words model that performed best from the previous task
#NOTE(review): per the scikit-learn docs, stop_words and token_pattern only apply
#when analyzer == 'word', and a callable analyzer does its own preprocessing, so
#with analyzer=stemmed_words these three arguments appear to have no effect.
#Confirm whether stop-word removal was intended.

vect = make_pipeline(
    CountVectorizer(min_df=3, stop_words="english",lowercase=True, analyzer=stemmed_words, token_pattern=r"\b\w[\w’]+\b"),
    #toarray() returns a plain ndarray; todense() returns np.matrix, which
    #current scikit-learn estimators reject
    FunctionTransformer(lambda x: x.toarray()))
teXt_train = vect.fit_transform(nX_train)
teXt_test = vect.transform(nX_test)


In [0]:
#drop the raw text lists to save RAM
del nX_train, nX_test
gc.collect()
#rebind both names to empty lists so they still exist
nX_train, nX_test = [], []

In [0]:
#Joining the GloVe features and the bag-of-words features column-wise

#training split
X_tr = np.concatenate([X_train, teXt_train], axis=1)

#release the training inputs to save RAM and leave small placeholders behind
del teXt_train, X_train
gc.collect()
teXt_train = []
X_train = np.ndarray(shape=(2,2), dtype=float)

#test split
X_te = np.concatenate([X_test, teXt_test], axis=1)

#release the test inputs to save RAM and leave small placeholders behind
del teXt_test, X_test
gc.collect()
teXt_test = []
X_test = np.ndarray(shape=(2,2), dtype=float)

In [0]:
#Training a linear model on the combined bag-of-words + GloVe features
#(the variable name lr_w2v is kept from the earlier embeddings-only cell)

lr_w2v = Ridge(alpha=20).fit(X_tr, y_train)

#label fixed: this score belongs to the combined model, not the embeddings-only one
print("testing score for bag of words + GloVe model: ",lr_w2v.score(X_te, y_test))

testing score for word embeddings model:  0.7466024942371853


Hence we found that the GloVe model alone could not outperform the models from Task 1. Combining the GloVe features with the bag-of-words model brought performance close to that of the model obtained in Task 1.4.