## SETUP

In [1]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-12-03 04:08:43--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-12-03 04:08:44--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-12-03 04:08:44--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [2]:
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from google.colab import drive

In [4]:
# Mount Google Drive at /content/drive; every data path used below lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Session object from Spark NLP's start() helper; the inference cells call spark.createDataFrame on it.
spark = sparknlp.start()

In [6]:
import re
def preprocessor(text):
  """Normalise one raw text string.

  Steps, in order:
    1. remove anything that looks like an HTML/XML tag (``<...>``);
    2. collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
       tag-free text (before lower-casing, so ``D`` and ``P`` still match);
    3. lower-case the text and collapse every run of non-word characters
       into a single space;
    4. append the collected emoticons, space-separated, with any ``-``
       removed.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string.
  """
  # Raw string literals: '\W', '\)' and '\(' are invalid escape sequences in
  # ordinary literals and trigger warnings on current Python versions.
  text = re.sub(r'<[^>]*>', '', text)
  emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  # NOTE(review): the emoticons are concatenated without a separator, so they
  # are glued to the last word whenever the text does not end in a non-word
  # character (e.g. "ok :) end" -> "ok end:)"). Kept as-is to preserve output.
  text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
  return text

## PER GENRE

Load IMDB data per genre

In [None]:
import pandas as pd
import glob
# Drive folder that holds the IMDB review CSVs for the per-genre analysis.
path = r'/content/drive/MyDrive/review_file' # use your path
# glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# integer index -> file mapping used by the following cells is the same on every run.
all_files = sorted(glob.glob(path + "/*.csv"))

In [None]:
# Read each CSV into a DataFrame, keyed by its position in `all_files`.
df_inputs = {}
for i, csv_path in enumerate(all_files):
  df_inputs[i] = pd.read_csv(csv_path)

Clean IMDB data

In [None]:
# Clean the 'text' column of every loaded frame and keep only the cleaned text.
# Per frame: drop @mentions -> drop English stop words -> preprocessor().
# The intermediate columns are written onto df_inputs[i] itself.
data_cleaned = {}
# The stop-word list does not depend on the file, so build it once; the set
# makes the per-word membership test constant-time.
stop = stopwords.words('english')
stop_set = set(stop)
for i in range(len(all_files)):
  data = df_inputs[i]
  # regex=True is required: the pattern is a regular expression, and pandas >= 2.0
  # treats it as a literal string (removing nothing) when regex is not given.
  data['remove_mentions'] = data['text'].str.replace(r'@\S+', '', regex=True)
  data['remove_stop'] = data['remove_mentions'].apply(
      lambda review: " ".join(word for word in review.split() if word.lower() not in stop_set))
  data['final_reviews'] = data["remove_stop"].apply(preprocessor)
  # Downstream cells expect a single column named "text".
  cleaned_df = pd.DataFrame()
  cleaned_df["text"] = data["final_reviews"]
  data_cleaned[i] = cleaned_df

Sentiment analysis

In [None]:
# Sentiment pipeline for the IMDB reviews:
#   "text" column -> document -> sentence embeddings (tfhub_use) -> "sentiment".
MODEL_NAME = 'sentimentdl_use_imdb'

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

sentimentdl = (
    SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_imdb download started this may take some time.
Approximate size to download 12 MB
[OK!]


In [None]:
# Run the sentiment pipeline over every cleaned frame.
# The pipeline is fitted on a one-row dummy frame that does not depend on the
# file being processed, so the fit is done once here instead of once per file.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

result = {}
for i in range(len(all_files)):
  # pandas -> Spark; data_cleaned[i] carries the single "text" column the pipeline reads.
  df = spark.createDataFrame(data_cleaned[i])
  result[i] = pipelineModel.transform(df)

In [None]:
from pyspark.sql.functions import countDistinct
# For every input file: count rows per predicted sentiment and write the table to Drive.
for i, csv_path in enumerate(all_files):
  # Zip document text with its prediction, one row per pair, then keep only
  # the prediction (field '1' of the zipped struct).
  zipped = F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols")
  label = F.expr("cols['1']").alias("sentiment")
  counts = result[i].select(zipped).select(label).groupBy("sentiment").count()
  pandas_res_final = counts.toDF("sentiment", "count").toPandas()
  # Output tag = file name without its directory and without its last 11 characters
  # (presumably a fixed suffix plus '.csv' — confirm against the input file names).
  name_start = csv_path.rfind('/') + 1
  tag = csv_path[name_start:-11]
  pandas_res_final.to_csv('/content/drive/MyDrive/IMDB_genre_sentiments_final/Sentiments' + tag + '.csv')

Load Twitter data per genre

In [None]:
import pandas as pd
import glob
# Drive folder that holds the tweet CSVs for the per-genre analysis.
path = r'/content/drive/MyDrive/Movies-tweets/Sentiments' # use your path
# glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# integer index -> file mapping used by the following cells is the same on every run.
all_files = sorted(glob.glob(path + "/*.csv"))

In [None]:
# Read each tweet CSV into a DataFrame, keyed by its position in `all_files`.
# Records are split on '\n' only (explicit lineterminator).
df_inputs = {}
for i, csv_path in enumerate(all_files):
  df_inputs[i] = pd.read_csv(csv_path, lineterminator='\n')

Clean Twitter data

In [None]:
# Clean the 'tweets' column of every loaded frame and keep only the cleaned text.
# Per frame: drop @mentions -> drop English stop words -> preprocessor().
# The intermediate columns are written onto df_inputs[i] itself.
data_cleaned = {}
# The stop-word list does not depend on the file, so build it once; the set
# makes the per-word membership test constant-time.
stop = stopwords.words('english')
stop_set = set(stop)
for i in range(len(all_files)):
  data = df_inputs[i]
  # regex=True is required: the pattern is a regular expression, and pandas >= 2.0
  # treats it as a literal string (removing nothing) when regex is not given.
  data['remove_mentions'] = data['tweets'].str.replace(r'@\S+', '', regex=True)
  data['remove_stop'] = data['remove_mentions'].apply(
      lambda tweet: " ".join(word for word in tweet.split() if word.lower() not in stop_set))
  data['final_reviews'] = data["remove_stop"].apply(preprocessor)
  # Downstream cells expect a single column named "text".
  cleaned_df = pd.DataFrame()
  cleaned_df["text"] = data["final_reviews"]
  data_cleaned[i] = cleaned_df

Sentiment analysis

In [None]:
# Sentiment pipeline for the tweets:
#   "text" column -> document -> sentence embeddings (tfhub_use) -> "sentiment".
MODEL_NAME = 'sentimentdl_use_twitter'

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

sentimentdl = (
    SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[OK!]


In [None]:
# Run the sentiment pipeline over every cleaned tweet frame.
# The pipeline is fitted on a one-row dummy frame that does not depend on the
# file being processed, so the fit is done once here instead of once per file.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

result = {}
for i in range(len(all_files)):
  # pandas -> Spark; data_cleaned[i] carries the single "text" column the pipeline reads.
  df = spark.createDataFrame(data_cleaned[i])
  result[i] = pipelineModel.transform(df)

In [None]:
from pyspark.sql.functions import countDistinct
# For every input file: count rows per predicted sentiment and write the table to Drive.
for i, csv_path in enumerate(all_files):
  # Zip document text with its prediction, one row per pair, then keep only
  # the prediction (field '1' of the zipped struct).
  zipped = F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols")
  label = F.expr("cols['1']").alias("sentiment")
  counts = result[i].select(zipped).select(label).groupBy("sentiment").count()
  pandas_res_final = counts.toDF("sentiment", "count").toPandas()
  # Output tag = file name without its directory and without its last 4 characters
  # (the '.csv' extension matched by the glob pattern).
  name_start = csv_path.rfind('/') + 1
  tag = csv_path[name_start:-4]
  pandas_res_final.to_csv('/content/drive/MyDrive/Twitter_genre_sentiment_final/Sentiments' + tag + '.csv')

## PER MOVIE

Load IMDB data per movie

In [7]:
import pandas as pd
import glob
# Drive folder that holds the raw IMDB review CSVs for the per-movie analysis.
path = r'/content/drive/MyDrive/imdb_dataset/2_reviews_per_movie_raw' # use your path
# glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# integer index -> file mapping used by the following cells is the same on every run.
all_files = sorted(glob.glob(path + "/*.csv"))

In [8]:
# Read each per-movie CSV into a DataFrame, keyed by its position in `all_files`.
df_inputs = {}
for i, csv_path in enumerate(all_files):
  df_inputs[i] = pd.read_csv(csv_path)

Clean IMDB data

In [25]:
# Clean the 'review' column of every loaded frame and keep only the cleaned text.
# Per frame: drop @mentions -> drop English stop words -> preprocessor().
data_cleaned = {}
# The stop-word list does not depend on the file, so build it once; the set
# makes the per-word membership test constant-time.
stop = stopwords.words('english')
stop_set = set(stop)
for i in range(len(all_files)):
  # Work on a fresh frame holding only the review text, so the intermediate
  # columns are not added to df_inputs[i].
  data = pd.DataFrame()
  data['review'] = df_inputs[i]['review']
  # regex=True is required: the pattern is a regular expression, and pandas >= 2.0
  # treats it as a literal string (removing nothing) when regex is not given.
  data['remove_mentions'] = data['review'].str.replace(r'@\S+', '', regex=True)
  data['remove_stop'] = data['remove_mentions'].apply(
      lambda review: " ".join(word for word in review.split() if word.lower() not in stop_set))
  data['final_reviews'] = data["remove_stop"].apply(preprocessor)
  # Downstream cells expect a single column named "text".
  cleaned_df = pd.DataFrame()
  cleaned_df["text"] = data["final_reviews"]
  data_cleaned[i] = cleaned_df

Sentiment analysis

In [28]:
# Sentiment pipeline for the per-movie IMDB reviews:
#   "text" column -> document -> sentence embeddings (tfhub_use) -> "sentiment".
MODEL_NAME = 'sentimentdl_use_imdb'

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

sentimentdl = (
    SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_imdb download started this may take some time.
Approximate size to download 12 MB
[OK!]


In [29]:
# Run the sentiment pipeline over every cleaned per-movie frame.
# The pipeline is fitted on a one-row dummy frame that does not depend on the
# file being processed, so the fit is done once here instead of once per file.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

result = {}
for i in range(len(all_files)):
  # pandas -> Spark; data_cleaned[i] carries the single "text" column the pipeline reads.
  df = spark.createDataFrame(data_cleaned[i])
  result[i] = pipelineModel.transform(df)

In [33]:
from pyspark.sql.functions import countDistinct
# For every input file: count rows per predicted sentiment and write the table to Drive.
for i, csv_path in enumerate(all_files):
  # Zip document text with its prediction, one row per pair, then keep only
  # the prediction (field '1' of the zipped struct).
  zipped = F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols")
  label = F.expr("cols['1']").alias("sentiment")
  counts = result[i].select(zipped).select(label).groupBy("sentiment").count()
  pandas_res_final = counts.toDF("sentiment", "count").toPandas()
  # Output name = input file name without its directory and without its last
  # 4 characters (the '.csv' extension matched by the glob pattern).
  name_start = csv_path.rfind('/') + 1
  tag = csv_path[name_start:-4]
  pandas_res_final.to_csv('/content/drive/MyDrive/imdb_dataset/3_sentiments_per_movie/' + tag + '.csv')

Load Twitter data per movie

Clean Twitter data

Sentiment analysis