## SETUP

In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-12-17 15:53:28--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-12-17 15:53:29--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-12-17 15:53:29--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [None]:
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; every input and output path used below
# lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Start the Spark session through Spark NLP; `spark` is used below to build the
# Spark DataFrames fed to the sentiment pipelines.
spark = sparknlp.start()

In [None]:
import re
def preprocessor(text: str) -> str:
  """Normalise a raw review/tweet for sentiment analysis.

  Steps:
    1. Remove HTML-like tags (anything between ``<`` and ``>``).
    2. Collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P`` from the text.
    3. Lower-case the text and collapse every run of non-word characters
       into a single space.
    4. Append the collected emoticons (with any ``-`` nose removed),
       separated by spaces.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned, lower-cased text followed by the emoticons found in it.
  """
  # Raw strings keep the regex escapes (\W, \), \() from being parsed as
  # (invalid) Python string escapes.
  text = re.sub(r'<[^>]*>', '', text)
  emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  cleaned = re.sub(r'[\W]+', ' ', text.lower())
  if emoticons:
    # Keep the emoticons apart from the last word: without this separator a text
    # ending in a word character would yield e.g. "film:)" instead of "film :)".
    if not cleaned.endswith(' '):
      cleaned += ' '
    cleaned += ' '.join(emoticons).replace('-', '')
  return cleaned

## PER GENRE

Load IMDB data per genre

In [None]:
import pandas as pd
import glob
path = r'/content/drive/MyDrive/review_file' # use your path
# glob.glob returns matches in arbitrary order; sort them so that index i refers
# to the same file on every run (the cells below key everything by that index).
all_files = sorted(glob.glob(path + "/*.csv"))

In [None]:
# Read every per-genre review CSV into a DataFrame, keyed by its position in all_files.
df_inputs = {}
for i, csv_path in enumerate(all_files):
  df_inputs[i] = pd.read_csv(csv_path)

Clean IMDB data

In [None]:
# Load the stop words once instead of once per file; the set gives O(1) lookups
# for the per-token membership test below.
stop = stopwords.words('english')
stop_lookup = set(stop)

data_cleaned = {}
for i in range(len(all_files)):
  data = df_inputs[i]
  # Strip @mentions. regex=True is passed explicitly because pandas >= 2.0 treats
  # the pattern as a literal string by default, which would leave mentions in place.
  data['remove_mentions'] = data['text'].str.replace(r'@\S+', '', regex=True)
  print(data["remove_mentions"])
  # Drop English stop words (case-insensitive), keeping the remaining tokens in order.
  data['remove_stop'] = data['remove_mentions'].apply(
      lambda review: " ".join(word for word in review.split() if word.lower() not in stop_lookup))
  # Remove HTML tags / punctuation and lower-case the text (see preprocessor).
  data['final_reviews'] = data["remove_stop"].apply(preprocessor)
  # The sentiment pipeline reads a single column named "text".
  cleaned_df = pd.DataFrame()
  cleaned_df["text"] = data["final_reviews"]
  data_cleaned[i] = cleaned_df

0                         a magnificent accomplishment."
1      he managed to save about 1100 Jews from being ...
2       while at the same time actually bringing ligh...
3       Spielberg himself has admitted that he tends ...
4       I think he'll be remembered for generations t...
                             ...                        
869     I am not saying in any way that this was a me...
870                                                 i.e.
871     and so on...but this is not a film about two ...
872     but maybe Ted Turner will put into color for ...
873     especially in his final breakdown scene.Ralph...
Name: remove_mentions, Length: 874, dtype: object
0        having every single detail of his work done i...
1        something no one has ever done before : inste...
2        was not so different than our hero Cobb. They...
3        pun intended. The movie explains (or at least...
4                                  Mal (Marion Cotillard)
                              ...

Sentiment analysis

In [None]:
# Name of the pretrained sentiment classifier loaded for this section.
MODEL_NAME = 'sentimentdl_use_imdb'

# Stage 1: turn the raw "text" column into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder ("tfhub_use"), document -> sentence_embeddings.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained SentimentDL classifier, sentence_embeddings -> sentiment.
sentimentdl = (
    SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in order.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]



KeyboardInterrupt



In [None]:
# Fitting does not depend on the file being processed, so build the PipelineModel
# once (on a one-row placeholder frame) instead of re-fitting for every file.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Run the fitted pipeline over each cleaned frame; results are keyed by file index.
result = {}
for i in range(len(all_files)):
  df = spark.createDataFrame(data_cleaned[i])
  result[i] = pipelineModel.transform(df)

In [None]:
from pyspark.sql.functions import countDistinct
import os

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
output_dir = '/content/drive/MyDrive/IMDB_genre_sentiments_final'
os.makedirs(output_dir, exist_ok=True)

for i in range(len(all_files)):
  # Zip each document with its predicted label, keep the label (the second zipped
  # array, addressed as cols['1']) and count the rows per label.
  save_res = (
      result[i]
      .select(F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols"))
      .select(F.expr("cols['1']").alias("sentiment"))
      .groupBy("sentiment")
      .count()
  )
  # groupBy(...).count() already names its columns "sentiment" and "count".
  pandas_res_final = save_res.toPandas()
  # File name without its directory and without its last 11 characters
  # (the extension plus a fixed-length suffix — TODO confirm the suffix is always 11 chars).
  k = os.path.basename(all_files[i])[:-11]
  pandas_res_final.to_csv(os.path.join(output_dir, 'Sentiments' + k + '.csv'))

Load Twitter data per genre

In [None]:
import pandas as pd
import glob
path = r'/content/drive/MyDrive/Movies-tweets/Sentiments' # use your path
# glob.glob returns matches in arbitrary order; sort them so that index i (and in
# particular data_cleaned[0], which is saved below) refers to the same file on every run.
all_files = sorted(glob.glob(path + "/*.csv"))

In [None]:
# Read every per-genre tweet CSV, keyed by its position in all_files.
# lineterminator='\n' makes '\n' the only row separator.
df_inputs = {}
for i, csv_path in enumerate(all_files):
  df_inputs[i] = pd.read_csv(csv_path, lineterminator='\n')

Clean Twitter data

In [None]:
# Load the stop words once instead of once per file; the set gives O(1) lookups
# for the per-token membership test below.
stop = stopwords.words('english')
stop_lookup = set(stop)

data_cleaned = {}
for i in range(len(all_files)):
  data = df_inputs[i]
  # Strip @mentions. regex=True is passed explicitly because pandas >= 2.0 treats
  # the pattern as a literal string by default, which would leave mentions in place.
  data['remove_mentions'] = data['tweets'].str.replace(r'@\S+', '', regex=True)
  # Drop English stop words (case-insensitive), keeping the remaining tokens in order.
  data['remove_stop'] = data['remove_mentions'].apply(
      lambda tweet: " ".join(word for word in tweet.split() if word.lower() not in stop_lookup))
  # Remove HTML tags / punctuation and lower-case the text (see preprocessor).
  data['final_reviews'] = data["remove_stop"].apply(preprocessor)
  # The sentiment pipeline reads a single column named "text".
  cleaned_df = pd.DataFrame()
  cleaned_df["text"] = data["final_reviews"]
  data_cleaned[i] = cleaned_df

In [None]:
# Write the first cleaned frame (index 0) to a file named "tweets_cleaned",
# relative to the current working directory.
# NOTE(review): the file name has no .csv extension and only index 0 is saved — confirm this is intended.
data_cleaned[0].to_csv("tweets_cleaned")

Sentiment analysis

In [None]:
# Name of the pretrained sentiment classifier loaded for this section.
MODEL_NAME = 'sentimentdl_use_twitter'

# Stage 1: turn the raw "text" column into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder ("tfhub_use"), document -> sentence_embeddings.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained SentimentDL classifier, sentence_embeddings -> sentiment.
sentimentdl = (
    SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in order.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[OK!]


In [None]:
# Fitting does not depend on the file being processed, so build the PipelineModel
# once (on a one-row placeholder frame) instead of re-fitting for every file.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Run the fitted pipeline over each cleaned frame; results are keyed by file index.
result = {}
for i in range(len(all_files)):
  df = spark.createDataFrame(data_cleaned[i])
  result[i] = pipelineModel.transform(df)

In [None]:
from pyspark.sql.functions import countDistinct
import os

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
output_dir = '/content/drive/MyDrive/Twitter_genre_sentiment_final'
os.makedirs(output_dir, exist_ok=True)

for i in range(len(all_files)):
  # Zip each document with its predicted label, keep the label (the second zipped
  # array, addressed as cols['1']) and count the rows per label.
  save_res = (
      result[i]
      .select(F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols"))
      .select(F.expr("cols['1']").alias("sentiment"))
      .groupBy("sentiment")
      .count()
  )
  # groupBy(...).count() already names its columns "sentiment" and "count".
  pandas_res_final = save_res.toPandas()
  # Input file name without directory and extension.
  k = os.path.splitext(os.path.basename(all_files[i]))[0]
  pandas_res_final.to_csv(os.path.join(output_dir, 'Sentiments' + k + '.csv'))

## PER MOVIE

Load IMDB data per movie

In [None]:
import pandas as pd
import glob
path = r'/content/drive/MyDrive/imdb_dataset/2_reviews_per_movie_raw' # use your path
# glob.glob returns matches in arbitrary order; sort them so that index i (and in
# particular data_cleaned[0], which is saved below) refers to the same file on every run.
all_files = sorted(glob.glob(path + "/*.csv"))

In [None]:
# Read every per-movie review CSV into a DataFrame, keyed by its position in all_files.
df_inputs = {}
for i, csv_path in enumerate(all_files):
  df_inputs[i] = pd.read_csv(csv_path)

Clean IMDB data

In [None]:
# Load the stop words once instead of once per file; the set gives O(1) lookups
# for the per-token membership test below.
stop = stopwords.words('english')
stop_lookup = set(stop)

data_cleaned = {}
for i in range(len(all_files)):
  # Work on a fresh frame holding only the review text, so df_inputs[i] is left untouched.
  data = pd.DataFrame()
  data['review'] = df_inputs[i]['review']
  # Strip @mentions. regex=True is passed explicitly because pandas >= 2.0 treats
  # the pattern as a literal string by default, which would leave mentions in place.
  data['remove_mentions'] = data['review'].str.replace(r'@\S+', '', regex=True)
  # Drop English stop words (case-insensitive), keeping the remaining tokens in order.
  data['remove_stop'] = data['remove_mentions'].apply(
      lambda review: " ".join(word for word in review.split() if word.lower() not in stop_lookup))
  # Remove HTML tags / punctuation and lower-case the text (see preprocessor).
  data['final_reviews'] = data["remove_stop"].apply(preprocessor)
  # The sentiment pipeline reads a single column named "text".
  cleaned_df = pd.DataFrame()
  cleaned_df["text"] = data["final_reviews"]
  data_cleaned[i] = cleaned_df

KeyboardInterrupt: ignored

In [None]:
# Write the first cleaned frame (index 0) to a file named "tweets_per_movie_cleaned",
# relative to the current working directory.
# NOTE(review): this section cleans the IMDB 'review' column, yet the file is named
# "tweets_..." and has no .csv extension — confirm the name is intended.
data_cleaned[0].to_csv("tweets_per_movie_cleaned")

Sentiment analysis

In [None]:
# Name of the pretrained sentiment classifier loaded for this section.
MODEL_NAME = 'sentimentdl_use_imdb'

# Stage 1: turn the raw "text" column into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder ("tfhub_use"), document -> sentence_embeddings.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained SentimentDL classifier, sentence_embeddings -> sentiment.
sentimentdl = (
    SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in order.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_imdb download started this may take some time.
Approximate size to download 12 MB
[OK!]


In [None]:
# Fitting does not depend on the file being processed, so build the PipelineModel
# once (on a one-row placeholder frame) instead of re-fitting for every file.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Run the fitted pipeline over each cleaned frame; results are keyed by file index.
result = {}
for i in range(len(all_files)):
  df = spark.createDataFrame(data_cleaned[i])
  result[i] = pipelineModel.transform(df)

In [None]:
from pyspark.sql.functions import countDistinct
import os

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
output_dir = '/content/drive/MyDrive/imdb_dataset/3_sentiments_per_movie'
os.makedirs(output_dir, exist_ok=True)

for i in range(len(all_files)):
  # Zip each document with its predicted label, keep the label (the second zipped
  # array, addressed as cols['1']) and count the rows per label.
  save_res = (
      result[i]
      .select(F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols"))
      .select(F.expr("cols['1']").alias("sentiment"))
      .groupBy("sentiment")
      .count()
  )
  # groupBy(...).count() already names its columns "sentiment" and "count".
  pandas_res_final = save_res.toPandas()
  # Input file name without directory and extension.
  k = os.path.splitext(os.path.basename(all_files[i]))[0]
  pandas_res_final.to_csv(os.path.join(output_dir, k + '.csv'))

Load Twitter data per movie

In [None]:
import os

# Folder names that are skipped while collecting the per-movie tweet files.
excluded_dirs = {'.ipynb_checkpoints', 'SentAnalysed', 'Sentiments'}

all_files = []
for root, subdirectories, files in os.walk('/content/drive/MyDrive/Movies-tweets'):
    # Prune excluded folders in place so os.walk never descends into them; this also
    # skips files nested deeper inside an excluded folder. Sorting the remaining
    # folders and the file names makes the index -> file mapping reproducible.
    subdirectories[:] = sorted(d for d in subdirectories if d not in excluded_dirs)
    for file in sorted(files):
        # Only CSV files can be handed to pd.read_csv in the next cell.
        if file.endswith('.csv'):
            all_files.append(os.path.join(root, file))

In [None]:
# Read every collected tweet file, echoing its path first so the file being
# loaded is visible in the cell output.
# lineterminator='\n' makes '\n' the only row separator.
df_inputs = {}
for i, csv_path in enumerate(all_files):
  print(csv_path)
  df_inputs[i] = pd.read_csv(csv_path, lineterminator='\n')

/content/drive/MyDrive/Movies-tweets/SciFi/Inception.csv
/content/drive/MyDrive/Movies-tweets/SciFi/Iron Man.csv
/content/drive/MyDrive/Movies-tweets/SciFi/The Matrix.csv
/content/drive/MyDrive/Movies-tweets/SciFi/Back to the Future.csv
/content/drive/MyDrive/Movies-tweets/SciFi/The Avengers.csv
/content/drive/MyDrive/Movies-tweets/SciFi/V for Vendetta.csv
/content/drive/MyDrive/Movies-tweets/SciFi/The Truman Show.csv
/content/drive/MyDrive/Movies-tweets/SciFi/Terminator 2_ Judgment Day.csv
/content/drive/MyDrive/Movies-tweets/SciFi/The Prestige.csv
/content/drive/MyDrive/Movies-tweets/SciFi/Interstellar.csv
/content/drive/MyDrive/Movies-tweets/SciFi/Eternal Sunshine of the Spotless Mind.csv
/content/drive/MyDrive/Movies-tweets/SciFi/Star Wars_ Episode VII - The Force Awakens.csv
/content/drive/MyDrive/Movies-tweets/SciFi/Jurassic Park.csv
/content/drive/MyDrive/Movies-tweets/SciFi/The Terminator.csv
/content/drive/MyDrive/Movies-tweets/SciFi/The Hunger Games.csv
/content/drive/MyDrive

files ignored:
/content/drive/MyDrive/Movies-tweets/Comedy/American Pie.csv

Clean Twitter data

In [None]:
# Load the stop words once instead of once per file; the set gives O(1) lookups
# for the per-token membership test below.
stop = stopwords.words('english')
stop_lookup = set(stop)

data_cleaned = {}
for i in range(len(all_files)):
  # Work on a fresh frame holding only the tweet text, so df_inputs[i] is left untouched.
  data = pd.DataFrame()
  data['review'] = df_inputs[i]['tweets']
  # Strip @mentions. regex=True is passed explicitly because pandas >= 2.0 treats
  # the pattern as a literal string by default, which would leave mentions in place.
  data['remove_mentions'] = data['review'].str.replace(r'@\S+', '', regex=True)
  # Drop English stop words (case-insensitive), keeping the remaining tokens in order.
  data['remove_stop'] = data['remove_mentions'].apply(
      lambda tweet: " ".join(word for word in tweet.split() if word.lower() not in stop_lookup))
  # Remove HTML tags / punctuation and lower-case the text (see preprocessor).
  data['final_reviews'] = data["remove_stop"].apply(preprocessor)
  # The sentiment pipeline reads a single column named "text".
  cleaned_df = pd.DataFrame()
  cleaned_df["text"] = data["final_reviews"]
  data_cleaned[i] = cleaned_df

In [None]:
# Display the cleaned "text" column of the first file (index 0 in all_files).
data_cleaned[0]

Unnamed: 0,text
0,どの階層にいてもいつキックが来るか分かるようにしておかないといけないな キックする時 音楽...
1,1 3 na versão antiga tinha uma cena onde sam p...
2,rt songs close milestones hala hala 39 98m 40m...
3,rt christian eschatology core nazi ideology si...
4,rt baby teenage fuse know onf lore barcodes on...
...,...
366,rt incredible growth since inception 1989 refl...
367,rt you dream live in dream never awake from at...
368,inception type shit
369,playlist shuffle inception starts playing auto...


Sentiment analysis

In [None]:
# Name of the pretrained sentiment classifier loaded for this section.
MODEL_NAME = 'sentimentdl_use_twitter'

# Stage 1: turn the raw "text" column into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder ("tfhub_use"), document -> sentence_embeddings.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained SentimentDL classifier, sentence_embeddings -> sentiment.
sentimentdl = (
    SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in order.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[OK!]


In [None]:
result = {}
# Fitting does not depend on the file being processed, so build the PipelineModel
# once (on a one-row placeholder frame) instead of re-fitting for every file.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Run the fitted pipeline over each cleaned frame; results are stored in `result` by file index.
for i in range(len(all_files)):
  df = spark.createDataFrame(data_cleaned[i])
  result[i] = pipelineModel.transform(df)

In [None]:
from pyspark.sql.functions import countDistinct
import os

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
output_dir = '/content/drive/MyDrive/twitter_per_movie_sentiments'
os.makedirs(output_dir, exist_ok=True)

for i in range(len(all_files)):
  # Zip each document with its predicted label, keep the label (the second zipped
  # array, addressed as cols['1']) and count the rows per label.
  save_res = (
      result[i]
      .select(F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols"))
      .select(F.expr("cols['1']").alias("sentiment"))
      .groupBy("sentiment")
      .count()
  )
  # groupBy(...).count() already names its columns "sentiment" and "count".
  pandas_res_final = save_res.toPandas()
  # Input file name without directory and extension.
  k = os.path.splitext(os.path.basename(all_files[i]))[0]
  pandas_res_final.to_csv(os.path.join(output_dir, k + '.csv'))

Twitter data for the latest movies

In [None]:
import os

# Folder names that are skipped while collecting the tweet files.
excluded_dirs = {'.ipynb_checkpoints', 'SentAnalysed', 'Sentiments'}
# The export cell of this section writes "<name>_twitter_sentiment.csv" into this
# same folder; those result files must not be picked up as tweet inputs on a re-run
# (they have no 'tweets' column).
result_suffix = '_twitter_sentiment.csv'

all_files = []
for root, subdirectories, files in os.walk('/content/drive/MyDrive/twitter_latest_movie_reviews'):
    # Prune excluded folders in place so os.walk never descends into them; sorting
    # folders and file names makes the index -> file mapping reproducible.
    subdirectories[:] = sorted(d for d in subdirectories if d not in excluded_dirs)
    for file in sorted(files):
        # Only CSV inputs can be handed to pd.read_csv in the next cell.
        if file.endswith('.csv') and not file.endswith(result_suffix):
            all_files.append(os.path.join(root, file))

In [None]:
# Read every collected tweet file, echoing its path first so the file being
# loaded is visible in the cell output.
# lineterminator='\n' makes '\n' the only row separator.
df_inputs = {}
for i, csv_path in enumerate(all_files):
  print(csv_path)
  df_inputs[i] = pd.read_csv(csv_path, lineterminator='\n')

/content/drive/MyDrive/twitter_latest_movie_reviews/spiderman.csv
/content/drive/MyDrive/twitter_latest_movie_reviews/quietplace.csv
/content/drive/MyDrive/twitter_latest_movie_reviews/notimetodie.csv
/content/drive/MyDrive/twitter_latest_movie_reviews/dune.csv
/content/drive/MyDrive/twitter_latest_movie_reviews/bw.csv


In [None]:
# Load the stop words once instead of once per file; the set gives O(1) lookups
# for the per-token membership test below.
stop = stopwords.words('english')
stop_lookup = set(stop)

data_cleaned = {}
for i in range(len(all_files)):
  # Work on a fresh frame holding only the tweet text, so df_inputs[i] is left untouched.
  data = pd.DataFrame()
  data['review'] = df_inputs[i]['tweets']
  # Strip @mentions. regex=True is passed explicitly because pandas >= 2.0 treats
  # the pattern as a literal string by default, which would leave mentions in place.
  data['remove_mentions'] = data['review'].str.replace(r'@\S+', '', regex=True)
  # Drop English stop words (case-insensitive), keeping the remaining tokens in order.
  data['remove_stop'] = data['remove_mentions'].apply(
      lambda tweet: " ".join(word for word in tweet.split() if word.lower() not in stop_lookup))
  # Remove HTML tags / punctuation and lower-case the text (see preprocessor).
  data['final_reviews'] = data["remove_stop"].apply(preprocessor)
  # The sentiment pipeline reads a single column named "text".
  cleaned_df = pd.DataFrame()
  cleaned_df["text"] = data["final_reviews"]
  data_cleaned[i] = cleaned_df

In [None]:
# Name of the pretrained sentiment classifier loaded for this section.
MODEL_NAME = 'sentimentdl_use_twitter'

# Stage 1: turn the raw "text" column into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder ("tfhub_use"), document -> sentence_embeddings.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained SentimentDL classifier, sentence_embeddings -> sentiment.
sentimentdl = (
    SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in order.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[OK!]


In [None]:
# Fitting does not depend on the file being processed, so build the PipelineModel
# once (on a one-row placeholder frame) instead of re-fitting for every file.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Run the fitted pipeline over each cleaned frame; results are keyed by file index.
result = {}
for i in range(len(all_files)):
  df = spark.createDataFrame(data_cleaned[i])
  result[i] = pipelineModel.transform(df)

In [None]:
from pyspark.sql.functions import countDistinct
# For every input file: count the predicted sentiment labels and write the counts
# next to the input as "<input name>_twitter_sentiment.csv".
for i in range(len(all_files)):
  source_path = all_files[i]
  # Pair each document with its predicted label, one row per pair.
  exploded = result[i].select(
      F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols"))
  # Keep only the label (the second zipped array, addressed as cols['1']).
  labels = exploded.select(F.expr("cols['1']").alias("sentiment"))
  save_res = labels.groupBy("sentiment").count()
  res_final = save_res.toDF("sentiment", "count")
  pandas_res_final = res_final.toPandas()
  # File name after the last '/', minus its last four characters.
  k = source_path.rsplit('/', 1)[-1][:-4]
  pandas_res_final.to_csv('/content/drive/MyDrive/twitter_latest_movie_reviews/'+k+'_twitter_sentiment.csv')