In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import langid
import string
import re
from googletrans import Translator
from autocorrect import Speller

In [29]:
#Reading the database
# Open the SQLite file read-only through a URI. A plain sqlite3.connect() would
# silently create an empty database when the file is missing; mode=ro makes
# that case fail immediately with sqlite3.OperationalError instead.
conn = sqlite3.connect("file:IMDB_Movies_2021.db?mode=ro", uri=True)
try:
    cursor = conn.cursor()

    # Pick the first user table; names starting with "sqlite_" are reserved
    # for SQLite's internal bookkeeping tables and are skipped.
    cursor.execute(
        "SELECT name FROM sqlite_master "
        "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
    )
    first_table = cursor.fetchone()
    if first_table is None:
        raise ValueError("No user tables found in IMDB_Movies_2021.db")
    table_name = first_table[0]
    print(f"Found table: {table_name}")

    # Identifiers cannot be bound as SQL parameters, so quote the name
    # (doubling embedded quotes) to cope with spaces or reserved words.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    cursor.execute(f"SELECT * FROM {quoted_table}")
    data = cursor.fetchall()  # all rows, as a list of tuples
    column_names = [desc[0] for desc in cursor.description]  # column names
finally:
    # Everything needed is now in memory; release the file handle.
    conn.close()

# Build the frame straight from the row tuples: pandas infers a dtype per
# column (missing ratings become NaN) and an empty table yields an empty frame.
df = pd.DataFrame(data, columns=column_names)

# Display the first few rows
print(df.head())


Found table: REVIEWS
  ID                                             REVIEW RATING  \
0  1  I don't get all the terrible reviews for this ...      5   
1  2  I cannot believe anyone could give this film l...      8   
2  3  Great White is not the worst way to spend 90 m...      4   
3  4  Great White is as basic of a killer shark film...      4   
4  5  Terrible story, dialogue and CGI. The film has...      4   

                 AUTHOR                                              TITLE  
0       margarida-44311                                          Not Bad\n  
1              joemay-2   What are all the bad reviews about is it a wo...  
2                  nebk                            Great White=Jaws Lite\n  
3             kuarinofu                     Bare-bones killer shark film\n  
4  Horror_Flick_Fanatic                Terrible story, dialogue, and CGI\n  


In [4]:
# Number of entries (rows) in the database table
df.shape[0]

5450

In [5]:
# Number of entries (rows) that contain at least one NaN
has_nan = df.isna().any(axis="columns")
nan_values = df.loc[has_nan]
print(nan_values.shape[0])

118


In [30]:
# Number of NaN entries in each column
missing_counts = df[["ID", "REVIEW", "RATING", "AUTHOR", "TITLE"]].isna().sum()
for col, n_missing in missing_counts.items():
    print(f"Missing values in {col}: {n_missing}")


Missing values in ID: 0
Missing values in REVIEW: 0
Missing values in RATING: 118
Missing values in AUTHOR: 0
Missing values in TITLE: 0


In [7]:
# Drop every row that contains at least one NaN (the original frame is left untouched)
df_delete = df.dropna(axis="index", how="any")

In [8]:
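#Replace all nan values with the mean rating (alternative to deleting the rows; currently disabled)
#rating_mean = df["RATING"].mean()
#df_mean = df.assign(RATING=df["RATING"].fillna(rating_mean))
#same result for the RATING column alone: df["RATING"].where(~df["RATING"].isnull(), rating_mean)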

In [31]:
# Duplicates: count rows that are identical, in every column, to an earlier row
# NOTE(review): the comparison includes ID; if ID is unique per row this count
# can never be non-zero — confirm whether ID should be excluded.
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
print(num_duplicates)

0


In [32]:
# Mean of the RATING column
df.loc[:, "RATING"].mean()

5.2426856714178545

In [11]:
#TODO: set a random seed if any later step is random, so the data set stays the same between runs

#TODO: consider a heatmap of the correlations between columns

In [42]:
#TEXT PREPROCESSING
# NOTE: despite its name, `rating` holds the REVIEW text (name kept as-is).
# Work on a copy so the translated text written below never leaks back into df.
rating = df["REVIEW"].copy()
print(rating.head())

# 1) Language detection on the raw text. Punctuation is still present here,
#    which gives both the detector and the translator more to work with.
non_eng_indices = [
    idx for idx, text in rating.items()
    if langid.classify(text)[0] != 'en'
]

# 2) Translate the non-English reviews to English and write them back by label.
# NOTE(review): assumes the synchronous googletrans API (translate() returns an
# object with .text) — verify against the installed version.
translator = Translator()
trans_text = [
    translator.translate(rating.at[idx], dest='en').text
    for idx in non_eng_indices
]
for idx, translated_sentence in zip(non_eng_indices, trans_text):
    rating.at[idx] = translated_sentence

# 3) Normalise every review the same way, translated ones included:
#    lowercase -> punctuation to spaces -> unusual characters to spaces
#    -> collapse whitespace.
punct = str.maketrans(string.punctuation, ' ' * len(string.punctuation)) #each punctuation char becomes one space: '!!!' -> '   '
char_bank = string.ascii_letters + string.digits + string.punctuation #'normal' chars; punctuation only survives if the translate(punct) step is removed
non_bank_pattern = '[^' + re.escape(char_bank) + ']'
rating = (
    rating.str.lower()
    .str.translate(punct)
    .str.replace(non_bank_pattern, ' ', regex=True) #replace chars outside the bank (incl. newlines/tabs)
    .str.replace(r'\s+', ' ', regex=True) #multiple white spaces to one
    .str.strip()
)

# 4) Spelling correction, one review at a time.
spell = Speller()
rating = rating.apply(spell) #autocorrect

# Spot-check two reviews
print(rating.loc[546])
print(rating.loc[584])
#maybe lemmatization, numbers to words, removing filler words

0    I don't get all the terrible reviews for this ...
1    I cannot believe anyone could give this film l...
2    Great White is not the worst way to spend 90 m...
3    Great White is as basic of a killer shark film...
4    Terrible story, dialogue and CGI. The film has...
Name: REVIEW, dtype: object
goonies wanna be goonies wanna be goonies wanna be goonies wanna be
tried to give it a chance but sorry it wasn t worth to spend 2 hours watching the movie          
