In [1]:
# Mount Google Drive inside the Colab filesystem at /content/drive so that
# files stored on Drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Location of the reviews CSV, given relative to the current working directory.
# NOTE(review): Drive is mounted at '/content/drive', so this relative path
# presumably only resolves when the working directory is '/content' - confirm.
DATASET_FILE = "drive/MyDrive/processed_dataset.csv"
# Load the whole dataset into a DataFrame.
df = pd.read_csv(DATASET_FILE)

In [4]:
# Extract the negative reviews from the dataset: here we keep the 1-star and
# 2-star reviews.
one_star_df = df.loc[df['stars'] == 1]
two_stars_df = df.loc[df['stars'] == 2]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent. ignore_index=True renumbers the rows
# 0..n-1 so that positions and labels coincide in the merged frame.
negative_df = pd.concat([one_star_df, two_stars_df], ignore_index=True)

# Sanity-check the merge by showing the two rows on each side of the junction
# between the 1-star and 2-star blocks. The junction is derived from the data
# instead of being hard-coded, so the check stays valid if the dataset changes.
junction = len(one_star_df)
negative_df.iloc[max(junction - 2, 0):junction + 2]

Unnamed: 0,text,stars,length,clean text
4998,"Awful. Awful. Awful. Cold chicken, semi-stale ...",1,84,awful awful awful cold chicken semi stale chip...
4999,Even if this pizza is the best thing I've ever...,1,54,even pizza best thing ever taste not_order ord...
5000,"As a Northeastern student, I was excited when ...",2,142,northeastern student excite place open last ye...
5001,So I had to knock my review down a couple of s...,2,206,knock review couple star consistency important...


In [5]:
# negative_df[4998:5002] # checking for the merging at the junction of the dataframes

In [6]:
# Build TF-IDF features from the cleaned review text. Terms present in more
# than 80% of the reviews or in fewer than 1% of them are discarded.
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.8)
X = vectorizer.fit_transform(negative_df['clean text'])

# Dense document-term table: one row per review, one column per retained term.
# Despite its name, BOW holds TF-IDF weights rather than raw counts.
BOW = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)


In [7]:
def display_topics(model, feature_names, num_top_words, topic_names=None):
    """Print and collect the highest-weighted words of every topic in a model.

    Parameters
    ----------
    model : object
        Fitted decomposition model exposing ``components_``, an iterable with
        one array of per-term weights for each topic.
    feature_names : sequence of str
        Term for each column of ``components_``, indexed by column position.
    num_top_words : int
        Number of highest-weighted terms to report per topic.
    topic_names : sequence of str, optional
        Display name per topic. A topic whose entry is missing or falsy is
        printed with its numeric index instead.

    Returns
    -------
    list of str
        One string per topic containing its top terms, ordered from highest
        to lowest weight and joined by ``", "``.
    """
    topic_tokens = []
    for ix, topic in enumerate(model.components_):
        # Print the topic header: its custom name when given, else its index.
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)

        else:
            print("\nTopic: '",topic_names[ix],"'")
        # argsort is ascending, so walking the last `num_top_words` positions
        # backwards yields the heaviest terms first. Build the string once and
        # reuse it for both the printout and the returned list.
        top_words = ", ".join(
            feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]
        )
        print(top_words)
        topic_tokens.append(top_words)
    return topic_tokens

In [None]:
from sklearn.decomposition import NMF

# Number of latent topics to extract from the negative reviews.
number_of_topics = 15
# Fixed seed for the factorization. The topics are labelled by position in a
# later cell, so their order and content must be stable across re-runs.
NMF_RANDOM_STATE = 42
nmf_model = NMF(n_components=number_of_topics, random_state=NMF_RANDOM_STATE)

# Factorize the TF-IDF table; doc_topic holds one row per review and one
# column per topic (the weight of that topic in that review).
doc_topic = nmf_model.fit_transform(BOW)

In [None]:
# Print the 10 highest-weighted terms of each topic and keep them as strings.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and matches the call used when building BOW.
topic_tokens = display_topics(nmf_model, vectorizer.get_feature_names_out(), 10)

In [10]:
# Wrap the document-topic weights in a DataFrame (one column per topic) so the
# strongest reviews of each topic can be looked up by row label.
doc_topic_df = pd.DataFrame(doc_topic)

# For every topic, take the original text of the 3 reviews with the highest
# weight on that topic. Row labels of doc_topic_df are used to select rows of
# negative_df, so both must share the same 0..n-1 numbering.
top_reviews = [
    negative_df['text'].loc[doc_topic_df[topic_id].nlargest(3).index]
    for topic_id in range(number_of_topics)
]

In [11]:
# Split the per-topic review Series into three parallel lists: the k-th list
# holds, for each topic in order, its k-th highest-weighted review text.
top_review_1 = [reviews.iloc[0] for reviews in top_reviews]
top_review_2 = [reviews.iloc[1] for reviews in top_reviews]
top_review_3 = [reviews.iloc[2] for reviews in top_reviews]


In [12]:
# Summary table with one row per topic: its top terms followed by the three
# reviews that load most heavily on it.
topic_df = pd.DataFrame(
    {
        "topics": topic_tokens,
        "top_review_1": top_review_1,
        "top_review_2": top_review_2,
        "top_review_3": top_review_3,
    }
)

In [None]:
# Display the per-topic summary table (top terms and three sample reviews).
topic_df

In [16]:
# Human-readable label (in French) for each of the 15 topics, keyed
# 'topic1' .. 'topic15'.
# NOTE(review): the keys are 1-based; presumably 'topicN' labels the NMF
# component at index N-1 - confirm against the topic table.
topic = dict(
    topic1='ACCUEIL ET SERVICE',
    topic2='NOURRITURE ASIATQUE MAUVAISE ',
    topic3="TEMPS D'ATTENTE ET PIZZA FROIDE OU TROP CUITE",
    topic4='MAUVAISE EXPERIENCE AVEC LE PESONNEL ',
    topic5='PROBLEME BURGER (ERREUR COMMANDE, PRIX ELEVE, ETC)',
    topic6="TEMPS D'ATTENTE TROP LONG",
    topic7='PROBLEME QUALITE BURGER',
    topic8='TRES MAUVAIS SERVICE CLIENT',
    topic9='PROBLEME DE POULET',
    topic10='BAR MAUVAIS EXPEIENCE AVEC LE PERSONNEL',
    topic11='CLIENT DECU, NE REVIENDRA PAS',
    topic12='NOURRITURE JAPONNAISE DECEVANTE',
    topic13='MAUVAIS SANDWICH',
    topic14='EXPERIENCE MEDIOCRE, PRIX LEGEREMENT TROP ELEVES',
    topic15='PROBLEME DANS LA COMMANDE',
)

In [17]:
# Display the topic-label mapping.
topic

{'topic1': 'ACCUEIL ET SERVICE',
 'topic10': 'BAR MAUVAIS EXPEIENCE AVEC LE PERSONNEL',
 'topic11': 'CLIENT DECU, NE REVIENDRA PAS',
 'topic12': 'NOURRITURE JAPONNAISE DECEVANTE',
 'topic13': 'MAUVAIS SANDWICH',
 'topic14': 'EXPERIENCE MEDIOCRE, PRIX LEGEREMENT TROP ELEVES',
 'topic15': 'PROBLEME DANS LA COMMANDE',
 'topic2': 'NOURRITURE ASIATQUE MAUVAISE ',
 'topic3': "TEMPS D'ATTENTE ET PIZZA FROIDE OU TROP CUITE",
 'topic4': 'MAUVAISE EXPERIENCE AVEC LE PESONNEL ',
 'topic5': 'PROBLEME BURGER (ERREUR COMMANDE, PRIX ELEVE, ETC)',
 'topic6': "TEMPS D'ATTENTE TROP LONG",
 'topic7': 'PROBLEME QUALITE BURGER',
 'topic8': 'TRES MAUVAIS SERVICE CLIENT',
 'topic9': 'PROBLEME DE POULET'}

In [25]:
# Persist the fitted objects so new reviews can be vectorized and assigned to
# topics later without refitting.
# The original cell pickled X (the TF-IDF matrix) and doc_topic (the
# document-topic weights) under the names 'vectoriseur' and 'model'; neither
# can transform unseen text. The fitted vectorizer and NMF model are what
# those file names describe.
# The `with` blocks make sure each file is flushed and closed, which matters
# on a mounted Drive where an unclosed handle may never be synced.
with open('drive/MyDrive/vectoriseur', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

with open('drive/MyDrive/model', 'wb') as model_file:
    pickle.dump(nmf_model, model_file)