<a href="https://colab.research.google.com/github/bpayton0101/AAI-520-Final-Project/blob/main/002_0_data_cleaning_movie_lines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# The Cornell Movie Dialogs dataset is a popular resource for natural language
# processing tasks.
# However, it may require cleaning before use due to potential inconsistencies, errors, or unwanted elements.
# Here are some Python code recommendations to help you clean this dataset:


# 1. Import Necessary Libraries:

import pandas as pd
import re

# 2. Load the Dataset:

from google.colab import drive
drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/movie_lines.txt"  # Replace with your file's path

# Try opening the file with different encodings
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
except UnicodeDecodeError:
    try:
        with open(file_path, 'r', encoding='latin-1') as f:
            text = f.read()
    except UnicodeDecodeError:
        try:
            with open(file_path, 'r', encoding='cp1252') as f:
                text = f.read()
        except:
            print("Could not decode file. Please try a different encoding.")

# Read the TXT file's contents into a pandas DataFrame
# use the read_csv function to read the data, specifying the delimiter and encoding
# If you encounter errors, you may need to experiment with different delimiters
# or handle bad lines
try:
  text_data = pd.read_csv(file_path, delimiter="\t", encoding='latin-1', on_bad_lines='skip', header=None, names=["character", "line"]) # Pass the file_path to read_csv
except pd.errors.ParserError as e:
  print(f"ParserError: {e}")
  # If you encounter a ParserError, try a different delimiter
  try:
    text_data = pd.read_csv(file_path, delimiter=" +++$+++ ", encoding='latin-1', on_bad_lines='skip', header=None, names=["character", "line"]) # Pass the file_path to read_csv
  except pd.errors.ParserError as e:
    print(f"ParserError: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 2a. Display Data
new_var = display(text_data)
new_var


Unnamed: 0,character,line
0,L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++...,
1,L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON ++...,
2,L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$...,
3,L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++...,
4,L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$...,
...,...,...
304600,L666371 +++$+++ u9030 +++$+++ m616 +++$+++ DUR...,
304601,L666370 +++$+++ u9034 +++$+++ m616 +++$+++ VER...,
304602,L666369 +++$+++ u9030 +++$+++ m616 +++$+++ DUR...,
304603,L666257 +++$+++ u9030 +++$+++ m616 +++$+++ DUR...,


In [None]:

# 2b. Pre-Processing Count words and characters
word_count = text_data["line"].dropna().str.split().apply(len).sum() # Added dropna() to remove missing values
char_count = text_data["line"].dropna().str.len().sum() # Added dropna() to remove missing values

# Print the results
print("Total words:", word_count)
print("Total characters:", char_count)

Total words: 698
Total characters: 3695


In [None]:
# 3. Handle Missing Values:
text_data.dropna(inplace=True)  # Remove rows with missing values

In [None]:
# 4. Remove Duplicate Rows:
text_data.drop_duplicates(inplace=True)  # Remove duplicate rows

In [None]:
# 5. Clean Text Data:

def clean_text(text):
    # Remove punctuation and special characters
    text = re.sub(r"[^\w\s]", "", text)
    # Convert text to lowercase
    text = text.lower()

    # Remove extra whitespace
    text = text.strip()
    return text

text_data["line"] = text_data["line"].apply(clean_text)

In [None]:
# 6. Remove Stop Words:

!pip install nltk
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

text_data["line"] = text_data["line"].apply(remove_stopwords)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# 7. Handle Contractions:

!pip install contractions # install the missing contractions library
import contractions

text_data["line"] = text_data["line"].apply(contractions.fix)

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K

In [None]:
# 8. Stem or Lemmatize:

from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_text(text):
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmatized_words)

# Choose either stemming or lemmatization:
text_data["line"] = text_data["line"].apply(stem_text)
# Download wordnet before using lemmatizer
nltk.download('wordnet')
text_data["line"] = text_data["line"].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# 9. To remove characters like " +++$+++"
# Remove " +++$+++" and any leading/trailing whitespace
text_data["line"] = text_data["line"].str.replace(r"^\s*\+\+\+ \$\ \+\+\+\s*$", "", regex=True)
                                                    # Escaped the + symbols with backslashes

In [None]:
# 10. Post Processing Count words and characters
word_count = text_data["line"].dropna().str.split().apply(len).sum() # Added dropna() to remove missing values
char_count = text_data["line"].dropna().str.len().sum() # Added dropna() to remove missing values

# Print the results
print("Total words:", word_count)
print("Total characters:", char_count)

Total words: 380
Total characters: 2038


In [None]:
# 11. Save file processed in cell above as 'cleaned_text_data' to my google
# drive

from google.colab import files

# Assuming 'text_data' is your DataFrame containing the cleaned data
text_data.to_csv('/content/drive/MyDrive/cleaned_text_data.csv', index=False)


In [None]:
# prompt: save file in above cell as txt file

# Assuming 'text_data' is your DataFrame containing the cleaned data
text_data.to_csv('/content/drive/MyDrive/cleaned_text_data.txt', index=False, sep='\t')


In [None]:
def combine_lines_from_txt_file(file_path):
  """Combines lines from a TXT file into a single unstructured string.

  Args:
    file_path: The path to the TXT file.

  Returns:
    A string containing all the lines from the file combined into a single string.
  """

  with open(file_path, 'r') as f:
    combined_text = ''.join(f.readlines())
    combined_text = combined_text.replace('\n', '')  # Remove newline characters
  return combined_text

# Example usage:
file_path = "/content/drive/MyDrive/cleaned_text_data.txt"  # Replace with the actual path to your TXT file
unstructured_data = combine_lines_from_txt_file(file_path)
print(unstructured_data)

character	lineL229706 +++$+++ u1042 +++$+++ m68 +++$+++ JASON +++$+++ BRANDON!	time goL229857 +++$+++ u1041 +++$+++ m68 +++$+++ GWEN +++$+++ All systems are working, Commander. ,~ -cc	pink cL229881 +++$+++ u1049 +++$+++ m68 +++$+++ TOMMY +++$+++ I see them!  I see them! RD STREET	pasadena 57L229801 +++$+++ u1041 +++$+++ m68 +++$+++ GWEN +++$+++ What are you doing? What are thev doino? ~7C INT. SARRIS' SHIP	h37cL237881 +++$+++ u1117 +++$+++ m73 +++$+++ REDBEARD +++$+++ It would have been a beautiful bridge, John. I never noticed before, occupied with other business, I	suppose never really pay much attention kind thing ive time today nothing else graceful placement couldnt prettierL381472 +++$+++ u1914 +++$+++ m125 +++$+++ ZED'S VOICE +++$+++ We've got about eight or nine prospects	want lookL496383 +++$+++ u2667 +++$+++ m174 +++$+++ AUGUST +++$+++ That's what I tried to find out.	went la jeunesse used phony health inspectors badge let go works one room room locked id like see inside room

In [None]:
display(unstructured_data)

"character\tlineL229706 +++$+++ u1042 +++$+++ m68 +++$+++ JASON +++$+++ BRANDON!\ttime goL229857 +++$+++ u1041 +++$+++ m68 +++$+++ GWEN +++$+++ All systems are working, Commander. ,~ -cc\tpink cL229881 +++$+++ u1049 +++$+++ m68 +++$+++ TOMMY +++$+++ I see them!  I see them! RD STREET\tpasadena 57L229801 +++$+++ u1041 +++$+++ m68 +++$+++ GWEN +++$+++ What are you doing? What are thev doino? ~7C INT. SARRIS' SHIP\th37cL237881 +++$+++ u1117 +++$+++ m73 +++$+++ REDBEARD +++$+++ It would have been a beautiful bridge, John. I never noticed before, occupied with other business, I\tsuppose never really pay much attention kind thing ive time today nothing else graceful placement couldnt prettierL381472 +++$+++ u1914 +++$+++ m125 +++$+++ ZED'S VOICE +++$+++ We've got about eight or nine prospects\twant lookL496383 +++$+++ u2667 +++$+++ m174 +++$+++ AUGUST +++$+++ That's what I tried to find out.\twent la jeunesse used phony health inspectors badge let go works one room room locked id like see in

In [None]:
# prompt: save  unstructured_data to my Google Drive

with open('/content/drive/MyDrive/unstructured_data.txt', 'w') as f:
  f.write(unstructured_data)


In [None]:
word_count = text_data["line"].dropna().str.split().apply(len).sum() # Added dropna() to remove missing values
char_count = text_data["line"].dropna().str.len().sum() # Added dropna() to remove missing values

# Print the results
print("Total words:", word_count)
print("Total characters:", char_count)

Total words: 356
Total characters: 2186
