In [1]:
import pandas as pd
import numpy as np

# Importing necessary libraries

In [2]:
# Importing message.csv from review.csv
messages = pd.read_csv('messages.csv')

In [3]:
messages.shape

(98285, 2)

In [5]:
# importing cleaned data
df1 = pd.read_csv('fake_data_cleaned.csv')

In [6]:
#displaying top 5 records of df1
df1.head()

Unnamed: 0,label,text_
0,1,love well made sturdy comfortable love pretty
1,1,love great upgrade original mine couple year
2,1,pillow saved back love look feel pillow
3,1,missing information use great product price
4,1,nice set good quality set two month


In [7]:
# checking number of rows and columns
df1.shape

(40432, 2)

In [8]:
# changing column text_ into text
df1.columns = ['label' , 'text']

In [9]:
# checking null values and datatype of each column
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   40432 non-null  int64 
 1   text    40431 non-null  object
dtypes: int64(1), object(1)
memory usage: 631.9+ KB


In [10]:
# displaying row number where the null value exist

import numpy as np
np.where(df1.text.isnull())

(array([37914], dtype=int64),)

In [11]:
# dropping the row having null value
df1.drop([37914] , inplace = True)

In [12]:
# checking null values and datatype of each column
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40431 entries, 0 to 40431
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   40431 non-null  int64 
 1   text    40431 non-null  object
dtypes: int64(1), object(1)
memory usage: 947.6+ KB


In [13]:
# concating the dataset df1 and messages
# df1 --> labeled dataset
# messages --> scrapped dataset
data = pd.concat([df1 , messages] , axis = 0)

In [14]:
# reseting the index
data.index = range(len(data))
data.tail() # displaying bottom 5 records

Unnamed: 0,label,text
138711,0,Exceptional hospitality in every way at Triden...
138712,0,I have had the privilege of staying at 5 star ...
138713,0,A lovely experience at Trident. The atmosphere...
138714,0,""" thanks to trident .The service of the hotel ..."
138715,0,Thanks to trident agra we loved it here and en...


## TFIDF 

In [15]:
# importing library 
from sklearn.feature_extraction.text import TfidfVectorizer

#creating object vectorizer
vectorizer = TfidfVectorizer(max_features = 10000) #limiting the number of features to 10000
vectorized_corpus = vectorizer.fit_transform(data['text']).toarray() #coverting text column into array

df_temp = pd.DataFrame(vectorized_corpus,columns=vectorizer.get_feature_names_out())

print(df_temp) #printing the rows and columns

         00  000  00am  00pm   01   02   03   05   06   09  ...  zirakpur  \
0       0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   
1       0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   
2       0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   
3       0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   
4       0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   
...     ...  ...   ...   ...  ...  ...  ...  ...  ...  ...  ...       ...   
138711  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   
138712  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   
138713  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   
138714  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   
138715  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0   

        zodiac  zomato  zombie  zone  zones  zoo  zoom  zoya   और  
0      

In [16]:
# printing vectorized_corpus
print(vectorized_corpus)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
# printing count of each vectorizer
vectorizer.vocabulary_

{'love': 5181,
 'well': 9758,
 'made': 5225,
 'sturdy': 8568,
 'comfortable': 1903,
 'pretty': 6724,
 'great': 3847,
 'upgrade': 9395,
 'original': 6103,
 'mine': 5552,
 'couple': 2160,
 'year': 9951,
 'pillow': 6462,
 'saved': 7652,
 'back': 826,
 'look': 5156,
 'feel': 3308,
 'missing': 5576,
 'information': 4429,
 'use': 9426,
 'product': 6774,
 'price': 6729,
 'nice': 5862,
 'set': 7823,
 'good': 3782,
 'quality': 6896,
 'two': 9289,
 'month': 5641,
 'wanted': 9666,
 'different': 2570,
 'flavor': 3410,
 'perfect': 6366,
 'touch': 9119,
 'thing': 8968,
 'wish': 9851,
 'little': 5111,
 'space': 8269,
 'done': 2717,
 'fit': 3388,
 'edge': 2869,
 'extra': 3202,
 'big': 1097,
 'number': 5952,
 'easy': 2850,
 'read': 7046,
 'like': 5064,
 'size': 8072,
 'son': 8215,
 'also': 407,
 'baby': 825,
 'advertised': 287,
 'th': 8916,
 'one': 6038,
 'problem': 6764,
 'really': 7063,
 'handy': 3975,
 'kid': 4800,
 'tool': 9099,
 'included': 4383,
 'package': 6170,
 'someone': 8207,
 'say': 7661,
 

In [18]:
# checking number of rows and columns
vectorized_corpus.shape

(138716, 10000)

In [19]:
# putting the labeled data into X to train the model
X = vectorized_corpus[:40431,:]

In [20]:
# checking number of rows and columns
X.shape

(40431, 10000)

In [21]:
# putting the label column from df1 into y
y = df1.iloc[:,0].values

In [22]:
# checking number of rows and columns
y.shape

(40431,)

In [23]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.20 , random_state = 0)

### TFIDF ==> Naive Bayes Classifier

In [27]:
# Training model using Naive Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

fake_detect_model = MultinomialNB().fit(X_train , y_train)

y_pred = fake_detect_model.predict(X_test)

In [31]:
# Now we have trained our model to identify fake reviews, and now we will check from our webscraped data 
#how many of the data we have are fake

web_scraped_data = vectorized_corpus[40431:,:]


In [32]:
# checking number of rows and columns
web_scraped_data.shape

(98285, 10000)

In [44]:
# displaying the total number of fake reviews in web_scraped_data
# Naive Bayes Classifier
sum(fake_detect_model.predict(web_scraped_data))

12166

In [45]:
df = pd.read_csv('reviews.csv')

In [46]:
df.head()

Unnamed: 0.1,Unnamed: 0,city_name,place_to_visit,place_name,reviews
0,0,Kolkata,Restaurants,Ottimo Cucina Italiana,Mr.Prakash Kumar and Mr.Sourav both are one on...
1,1,Kolkata,Restaurants,Ottimo Cucina Italiana,Absolutely great place for gathering and indee...
2,2,Kolkata,Restaurants,Ottimo Cucina Italiana,Pizza was good.Chefs antipasti antipasti selec...
3,3,Kolkata,Restaurants,Ottimo Cucina Italiana,It was a wonderful dinning experience in the r...
4,4,Kolkata,Restaurants,Ottimo Cucina Italiana,Had a really great time with family.\nFood was...


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98623 entries, 0 to 98622
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      98623 non-null  int64 
 1   city_name       98623 non-null  object
 2   place_to_visit  98623 non-null  object
 3   place_name      98623 non-null  object
 4   reviews         98623 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [48]:
df.drop(['Unnamed: 0'] , axis = 1 , inplace = True)
df = df.drop(df.iloc[np.where(df.city_name == 'city_name')].index , axis = 0)

# replacing the value names to avoid duplicacy in the data.

df = df.replace('jaipur' , 'Jaipur')
df = df.replace('agra' , 'Agra')
df = df.replace('Things_to_do' , 'Things')



In [49]:
messages.shape

(98285, 2)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98285 entries, 0 to 98622
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   city_name       98285 non-null  object
 1   place_to_visit  98285 non-null  object
 2   place_name      98285 non-null  object
 3   reviews         98285 non-null  object
dtypes: object(4)
memory usage: 3.7+ MB


In [51]:
df.isna().sum()

city_name         0
place_to_visit    0
place_name        0
reviews           0
dtype: int64

In [52]:
web_scraped_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
# creating a new column fake which determines whether the reviews is fake or original

df['Fake'] = fake_detect_model.predict(web_scraped_data)

In [57]:
df.head()

Unnamed: 0,city_name,place_to_visit,place_name,reviews,Fake
0,Kolkata,Restaurants,Ottimo Cucina Italiana,Mr.Prakash Kumar and Mr.Sourav both are one on...,0
1,Kolkata,Restaurants,Ottimo Cucina Italiana,Absolutely great place for gathering and indee...,0
2,Kolkata,Restaurants,Ottimo Cucina Italiana,Pizza was good.Chefs antipasti antipasti selec...,0
3,Kolkata,Restaurants,Ottimo Cucina Italiana,It was a wonderful dinning experience in the r...,0
4,Kolkata,Restaurants,Ottimo Cucina Italiana,Had a really great time with family.\nFood was...,0


In [54]:
#creating new dataframe fake_data and copying only the reviews which are fake
fake_data = df.iloc[np.where(df['Fake'] == 1)]
fake_data

Unnamed: 0,city_name,place_to_visit,place_name,reviews,Fake
6,Kolkata,Restaurants,Ottimo Cucina Italiana,"Very nice place must visit , interiors are goo...",1
7,Kolkata,Restaurants,Ottimo Cucina Italiana,Great to be announced that this was the best p...,1
9,Kolkata,Restaurants,Ottimo Cucina Italiana,The food was amazing and so fantastically pres...,1
20,Kolkata,Restaurants,Ottimo Cucina Italiana,Had a great experience at the Ottimo. We were ...,1
22,Kolkata,Restaurants,Ottimo Cucina Italiana,Excellent food. Very nice and kind waiters. Wo...,1
...,...,...,...,...,...
98534,Agra,Hotels,Trident,It was the annual family break that we took to...,1
98540,Agra,Hotels,Trident,Top notch hospitality. Each & every staff is ...,1
98543,Agra,Hotels,Trident,Very good Hotel in Agra. Nice and clean room.G...,1
98563,Agra,Hotels,Trident,It’s was a great experience I just loved the a...,1


In [59]:
df.to_csv('powerbi2.csv')