# NLP word embedding

In [23]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso





from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack




[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


This include all NLP approaches for word embedding

Prediction of Likes and Retweets Using Text Information Retrieval
https://ai.intelligentonlinetools.com/ml/text-clustering-doc2vec-word-embedding-machine-learning/

Paper:
1-s2.0-S1877050920304129-main.pdf


Le github magique:
https://github.com/buomsoo-kim/Word-embedding-with-Python

French corpus: https://stackoverflow.com/questions/42058396/python-nltk-and-textblob-in-french

## Word embedding à la main avec Doc 2 Vec

In [24]:
import re

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from scipy import spatial




In [25]:
#télcharge "quinze essais politiques" dans data et en fait une liste
# opening the file in read mode
my_file = open("data/quinze_essais_politiques.txt", "r")
  
# reading the file
corpus = my_file.read()
#remove \n
corpus = corpus.replace('\n', ' ')
corpus = corpus.replace('.', ',')

  
# replacing end splitting the text 
# when newline ('\n') is seen.
corpus = corpus.split(",")
my_file.close()
corpus[5]

'              David Hume “Essai sur la liberté de la presse”'

Use re module to preprocess data

Convert all letters into lowercase

Remove punctuations, numbers, etc

In [26]:
for i in range(len(corpus)):
    corpus[i] = corpus[i].lower()
    #remove punctuation
    corpus[i] = re.sub(r'[^\w\s]','',corpus[i])
    #make a list of corpus[i]
    corpus[i] = corpus[i].split()
    

corpus[5]


['david', 'hume', 'essai', 'sur', 'la', 'liberté', 'de', 'la', 'presse']

For the doc2vec model, input data should be in format of iterable TaggedDocuments"

Each TaggedDocument instance comprises words and tags

Hence, each document (i.e., a sentence or paragraph) should have a unique tag which is identifiable

In [27]:
for i in range(len(corpus)):
    corpus[i] = TaggedDocument(words = corpus[i], tags = ['sent{}'.format(i)])    # converting each sentence into a TaggedDocument
corpus[5]

TaggedDocument(words=['david', 'hume', 'essai', 'sur', 'la', 'liberté', 'de', 'la', 'presse'], tags=['sent5'])

In [28]:
model = Doc2Vec(documents = corpus, vector_size = 100, min_count = 1)
model.init_sims(replace = True)

model.save('doc2vec_model')
model = Doc2Vec.load('doc2vec_model')


In [29]:
v1 = model.infer_vector(['Macron',' démission'])    # in doc2vec, infer_vector() function is used to infer the vector embedding of a document
v2 = model.infer_vector(['gilets jaune'])    # in doc2vec, infer_vector() function is used to infer the vector embedding of a document
# define a function that computes cosine similarity between two words
def cosine_similarity(v1, v2):
    return 1 - spatial.distance.cosine(v1, v2)
cosine_similarity(v1, v2)


0.14602068066596985

# Word embedding with transfer learning

In [30]:
import spacy
# Load the spacy model that you have installed
nlp = spacy.load('fr_core_news_sm')
# process a sentence using the model
doc = nlp("Macron démission")
# It's that simple - all of the vectors and words are assigned after this point
# Get the vector for 'text':
doc.vector


array([ 2.77224708e+00, -8.20644140e-01, -3.03102303e+00,  1.71295309e+00,
       -1.51811123e-01, -6.19174361e-01,  8.30909967e-01,  3.04782152e+00,
        3.13939261e+00, -3.14197016e+00,  3.75156593e+00,  7.06376970e-01,
        4.97845650e-01, -7.39134669e-01, -1.51396203e+00, -6.00879252e-01,
        1.98837304e+00, -1.65502715e+00, -2.15656042e-01, -3.06843376e+00,
       -2.57701969e+00, -6.50485158e-02, -1.93754995e+00, -1.44916689e+00,
       -1.40775919e+00, -3.76376247e+00,  7.35826492e-02,  5.47314596e+00,
        2.39667892e+00, -1.73095465e-02, -3.04729295e+00,  3.57966995e+00,
       -4.89883900e-01,  1.82727182e+00, -7.40789533e-01, -2.31122684e+00,
       -6.91391945e-01, -1.97245240e+00,  6.30308867e-01, -3.27573270e-01,
       -3.56692076e-01, -8.23963046e-01,  3.11563540e+00, -4.16857243e+00,
       -1.66391611e+00, -2.21248603e+00,  2.82474899e+00,  5.70967078e-01,
        5.85822582e-01,  2.84727335e+00,  4.80433702e-01, -8.01777303e-01,
       -3.93913603e+00, -

# Model with Spacy text embedding



## Data

In [31]:
# Load the training data
train_data = pd.read_csv("data/train.csv")

# Load the evaluation data
eval_data = pd.read_csv("data/evaluation.csv")


# Here we split our training data into trainig and testing set. This way we can estimate the evaluation of our model without uploading to Kaggle and avoid overfitting over our evaluation dataset.
# scsplit method is used in order to split our regression data in a stratisfied way and keep a similar distribution of retweet counts between the two sets
X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweets_count'], stratify=train_data['retweets_count'], train_size=0.7, test_size=0.3)
# We remove the actual number of retweets from our features since it is the value that we are trying to predict
X_train = X_train.drop(['retweets_count'], axis=1)
X_test = X_test.drop(['retweets_count'], axis=1)

In [33]:
#use nlp model fro text train data
X_train_txt = X_train['text'].apply(lambda x: nlp(x).vector)
#use nlp model fro text eval data
print("starting vecoring X_test")
X_test_txt = X_test['text'].apply(lambda x: nlp(x).vector)

#expected duration: 26min


NameError: name 'RamdomForestRegressor' is not defined

In [38]:
#X_test_txt to pd
X_test_txt = pd.DataFrame(X_test_txt.to_list())
#X_train_txt to pd
X_train_txt = pd.DataFrame(X_train_txt.to_list())


In [39]:
from sklearn.ensemble import RandomForestRegressor
# We fit our model using the training data
print("starting model training")
reg = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0)
reg.fit(X_train_txt, y_train)
# Predict the number of retweets for the evaluation dataset
y_pred = reg.predict(X_test_txt)
# We want to make sure that all predictions are non-negative integers
y_pred = [int(value) if value >= 0 else 0 for value in y_pred]

print("Text RF", mean_absolute_error(y_true=y_test, y_pred=y_pred))

starting model training
