In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import time
 
# store starting time
begin = time.time()
 

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Kaggle")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [5]:
spark

In [6]:
import pandas as pd

In [7]:
data = pd.read_csv('/content/drive/MyDrive/Team13_Project/Fake_DataSet/reviews.csv')

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,city_name,place_to_visit,place_name,reviews
0,0,Kolkata,Restaurants,Ottimo Cucina Italiana,Mr.Prakash Kumar and Mr.Sourav both are one on...
1,1,Kolkata,Restaurants,Ottimo Cucina Italiana,Absolutely great place for gathering and indee...
2,2,Kolkata,Restaurants,Ottimo Cucina Italiana,Pizza was good.Chefs antipasti antipasti selec...
3,3,Kolkata,Restaurants,Ottimo Cucina Italiana,It was a wonderful dinning experience in the r...
4,4,Kolkata,Restaurants,Ottimo Cucina Italiana,Had a really great time with family.\nFood was...


In [9]:
df = spark.createDataFrame(data)

In [10]:
df.describe()

DataFrame[summary: string, Unnamed: 0: string, city_name: string, place_to_visit: string, place_name: string, reviews: string]

In [11]:
df.count()

98623

In [12]:
df.show()

+----------+---------+--------------+--------------------+--------------------+
|Unnamed: 0|city_name|place_to_visit|          place_name|             reviews|
+----------+---------+--------------+--------------------+--------------------+
|         0|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Mr.Prakash Kumar ...|
|         1|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Absolutely great ...|
|         2|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Pizza was good.Ch...|
|         3|  Kolkata|   Restaurants|Ottimo Cucina Ita...|It was a wonderfu...|
|         4|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Had a really grea...|
|         5|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Excellent food an...|
|         6|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Very nice place m...|
|         7|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Great to be annou...|
|         8|  Kolkata|   Restaurants|Ottimo Cucina Ita...|The food was grea...|
|         9|  Kolkata|   Restaurants|Ott

In [13]:
df = df.drop('Unnamed: 0')

In [14]:
df.show(5)

+---------+--------------+--------------------+--------------------+
|city_name|place_to_visit|          place_name|             reviews|
+---------+--------------+--------------------+--------------------+
|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Mr.Prakash Kumar ...|
|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Absolutely great ...|
|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Pizza was good.Ch...|
|  Kolkata|   Restaurants|Ottimo Cucina Ita...|It was a wonderfu...|
|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Had a really grea...|
+---------+--------------+--------------------+--------------------+
only showing top 5 rows



In [15]:
from pyspark.ml.feature import IDF, Tokenizer , StopWordsRemover
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from pyspark.sql.functions import udf
from pyspark.sql.functions import lower
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import concat_ws


df = df.withColumn("reviews", regexp_replace("reviews", "[^a-zA-Z]", " "))

df = df.withColumn("lower_reviews", lower(df["reviews"]))

tokenizer = Tokenizer(inputCol="lower_reviews", outputCol="tokenize_reviews")

df = tokenizer.transform(df)

stopwords_remover = StopWordsRemover(inputCol="tokenize_reviews", outputCol="filtered_words")

df = stopwords_remover.transform(df)  

wordnet = WordNetLemmatizer()

lemmatize_udf = udf(lambda tokens: [wordnet.lemmatize(token) for token in tokens], ArrayType(StringType()))

df = df.withColumn("lemmatize_reviews", lemmatize_udf(df["filtered_words"]))

df = df.withColumn("reviews_cleaned", concat_ws(" ", "lemmatize_reviews"))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [16]:
df.show()

+---------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|city_name|place_to_visit|          place_name|             reviews|       lower_reviews|    tokenize_reviews|      filtered_words|   lemmatize_reviews|     reviews_cleaned|
+---------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Mr Prakash Kumar ...|mr prakash kumar ...|[mr, prakash, kum...|[mr, prakash, kum...|[mr, prakash, kum...|mr prakash kumar ...|
|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Absolutely great ...|absolutely great ...|[absolutely, grea...|[absolutely, grea...|[absolutely, grea...|absolutely great ...|
|  Kolkata|   Restaurants|Ottimo Cucina Ita...|Pizza was good Ch...|pizza was good ch...|[pizza, was, good...|[pizza, good, che...

In [17]:
df.select('lemmatize_reviews').first()

Row(lemmatize_reviews=['mr', 'prakash', 'kumar', 'mr', 'sourav', 'one', 'best', 'staff', 'restaurant', 'ottimo', '', 'proper', 'knowledge', 'field', 'work', 'professionally', '', 'passionately', 'also', 'make', 'special', 'field', '', 'keep', 'keep', 'rising'])

In [18]:
df.columns

['city_name',
 'place_to_visit',
 'place_name',
 'reviews',
 'lower_reviews',
 'tokenize_reviews',
 'filtered_words',
 'lemmatize_reviews',
 'reviews_cleaned']

In [19]:
df = df.drop('city_name',\
 'place_to_visit',\
 'place_name',\
 'reviews',\
 'lower_reviews',\
 'tokenize_reviews',\
 'filtered_words',\
 'lemmatize_reviews')

In [20]:
df.show()

+--------------------+
|     reviews_cleaned|
+--------------------+
|mr prakash kumar ...|
|absolutely great ...|
|pizza good chef a...|
|wonderful dinning...|
|really great time...|
|excellent food se...|
|nice place must v...|
|great announced b...|
|food great  aweso...|
|food amazing fant...|
|best italian food...|
|visited place ann...|
|nice food   appre...|
|service impeccabl...|
|food great  host ...|
|excellent food se...|
|without doubt pal...|
|good experience o...|
|lovely time  grea...|
|fabulous evening ...|
+--------------------+
only showing top 20 rows



In [21]:
from pyspark.sql.functions import lit

df = df.withColumn("label", lit(0))
df.show()

+--------------------+-----+
|     reviews_cleaned|label|
+--------------------+-----+
|mr prakash kumar ...|    0|
|absolutely great ...|    0|
|pizza good chef a...|    0|
|wonderful dinning...|    0|
|really great time...|    0|
|excellent food se...|    0|
|nice place must v...|    0|
|great announced b...|    0|
|food great  aweso...|    0|
|food amazing fant...|    0|
|best italian food...|    0|
|visited place ann...|    0|
|nice food   appre...|    0|
|service impeccabl...|    0|
|food great  host ...|    0|
|excellent food se...|    0|
|without doubt pal...|    0|
|good experience o...|    0|
|lovely time  grea...|    0|
|fabulous evening ...|    0|
+--------------------+-----+
only showing top 20 rows



In [22]:
df = df.withColumnRenamed("reviews_cleaned", "reviews")

In [23]:
df = df.select('label','reviews')

In [24]:
df.show()

+-----+--------------------+
|label|             reviews|
+-----+--------------------+
|    0|mr prakash kumar ...|
|    0|absolutely great ...|
|    0|pizza good chef a...|
|    0|wonderful dinning...|
|    0|really great time...|
|    0|excellent food se...|
|    0|nice place must v...|
|    0|great announced b...|
|    0|food great  aweso...|
|    0|food amazing fant...|
|    0|best italian food...|
|    0|visited place ann...|
|    0|nice food   appre...|
|    0|service impeccabl...|
|    0|food great  host ...|
|    0|excellent food se...|
|    0|without doubt pal...|
|    0|good experience o...|
|    0|lovely time  grea...|
|    0|fabulous evening ...|
+-----+--------------------+
only showing top 20 rows



In [25]:
new_df = df.toPandas()

In [26]:
new_df.to_csv('messages.csv' , index = False)

In [27]:
end = time.time()
print(end - begin)

127.73031497001648
