In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import time

begin = time.time()

In [3]:
import pandas as pd
import numpy as np

# Importing necessary libraries

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Team13_Project/Fake_DataSet/reviews.csv')

# importing dataset consisting of reviews scrapped from website.

In [5]:
df.columns

#Displaying name of Columns.

Index(['Unnamed: 0', 'city_name', 'place_to_visit', 'place_name', 'reviews'], dtype='object')

In [6]:
df.drop(['Unnamed: 0'] , axis = 1 , inplace = True)

#Dropping Unnecessary Columns.

In [7]:
df.head()

# Displaying Top 5 Records.

Unnamed: 0,city_name,place_to_visit,place_name,reviews
0,Kolkata,Restaurants,Ottimo Cucina Italiana,Mr.Prakash Kumar and Mr.Sourav both are one on...
1,Kolkata,Restaurants,Ottimo Cucina Italiana,Absolutely great place for gathering and indee...
2,Kolkata,Restaurants,Ottimo Cucina Italiana,Pizza was good.Chefs antipasti antipasti selec...
3,Kolkata,Restaurants,Ottimo Cucina Italiana,It was a wonderful dinning experience in the r...
4,Kolkata,Restaurants,Ottimo Cucina Italiana,Had a really great time with family.\nFood was...


In [8]:
df.shape

#Checking Number of rows and Columns

(98623, 4)

In [9]:
df.info()

#checking null values and datatypes of each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98623 entries, 0 to 98622
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   city_name       98623 non-null  object
 1   place_to_visit  98623 non-null  object
 2   place_name      98623 non-null  object
 3   reviews         98623 non-null  object
dtypes: object(4)
memory usage: 3.0+ MB


In [10]:
# counting values in column city_name
df.city_name.value_counts()

Goa           9043
Ahmedabad     9022
Bangalore     9000
Delhi         9000
Udaipur       8999
Kolkata       8997
Mumbai        8978
Chandigarh    8919
Jaipur        8700
Pune          8690
Agra          6572
Varanasi      1465
agra           600
city_name      338
jaipur         300
Name: city_name, dtype: int64

In [11]:
df = df.drop(df.iloc[np.where(df.city_name == 'city_name')].index , axis = 0)


In [12]:
# replacing the value names to avoid duplicacy in the data.

df = df.replace('jaipur' , 'Jaipur')
df = df.replace('agra' , 'Agra')
df = df.replace('Things_to_do' , 'Things')

In [13]:
# copied the contents of reviews column in new dataframe messages
messages = pd.DataFrame(df.reviews)

In [14]:
# created a new column label with values 0
messages['label'] = 0

In [15]:
messages = pd.DataFrame(messages , columns = ['label' , 'reviews'])

In [16]:
#displaying top 5 records of messages
messages.head()

Unnamed: 0,label,reviews
0,0,Mr.Prakash Kumar and Mr.Sourav both are one on...
1,0,Absolutely great place for gathering and indee...
2,0,Pizza was good.Chefs antipasti antipasti selec...
3,0,It was a wonderful dinning experience in the r...
4,0,Had a really great time with family.\nFood was...


In [17]:
# changing name of reviews column to text
messages.columns = ['label' , 'text']

In [18]:
messages.reset_index(drop = True , inplace  = True)

In [19]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98285 entries, 0 to 98284
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   98285 non-null  int64 
 1   text    98285 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [20]:
# Data Cleaning and Preprocessing

# Importing Necessary Libraries
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [21]:
#To get the progress bar

from tqdm import tqdm , trange

#Creating Object of WordNet Lemmatizer
wordnet = WordNetLemmatizer()
corpus = [] #Created a List named corpus.

for i in trange(0,len(messages)):

    review = re.sub('[^a-zA-Z]' , ' ' , messages['text'][i]) # Using Regex Function to replace digits and Special chracters with space
    review = review.lower() #converting all the reviews into lower case
    review = review.split() # splitting the reviews (by default space)
    
    # using list comprehension to compare the words which are not in stopword english library and appending them into list corpus.
    review = [wordnet.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

100%|██████████| 98285/98285 [12:15<00:00, 133.59it/s]


In [22]:
#copied the data of list copus into text column
messages.text = corpus

In [23]:
#displaying top 5 records of messages
messages.head()

Unnamed: 0,label,text
0,0,mr prakash kumar mr sourav one best staff rest...
1,0,absolutely great place gathering indeed excell...
2,0,pizza good chef antipasti antipasti selection ...
3,0,wonderful dinning experience restaurant especi...
4,0,really great time family food really good soo ...


In [24]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98285 entries, 0 to 98284
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   98285 non-null  int64 
 1   text    98285 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [25]:
#Converting file to csv format
messages.to_csv('messages.csv' , index = False)

In [26]:
df.head()

Unnamed: 0,city_name,place_to_visit,place_name,reviews
0,Kolkata,Restaurants,Ottimo Cucina Italiana,Mr.Prakash Kumar and Mr.Sourav both are one on...
1,Kolkata,Restaurants,Ottimo Cucina Italiana,Absolutely great place for gathering and indee...
2,Kolkata,Restaurants,Ottimo Cucina Italiana,Pizza was good.Chefs antipasti antipasti selec...
3,Kolkata,Restaurants,Ottimo Cucina Italiana,It was a wonderful dinning experience in the r...
4,Kolkata,Restaurants,Ottimo Cucina Italiana,Had a really great time with family.\nFood was...


In [27]:
end = time.time()
print(end - begin)

746.4713091850281
