In [6]:
# Dela Rosa, Quielle Xyrone, 181603
# Carl Alden Go, 182216
# Margarita Juliana Perez, 183848

# October 19, 2021

# I certify that this submission complies with the DISCS Academic Integrity
# Policy.

# If I have discussed my Python language code with anyone other than
# my instructor(s), my/our groupmate(s), the teaching assistant(s),
# the extent of each discussion has been clearly noted along with a proper
# citation in the comments of my program.

# If any Python language code or documentation used in my program
# was obtained from another source, either modified or unmodified, such as a
# textbook, website, or another individual, the extent of its use has been
# clearly noted along with a proper citation in the comments of my program.

################################################################################

In [7]:
!pip install pickle-mixin nltk sklearn



In [8]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [9]:
import nltk
# Fetch the NLTK stop-word corpus that `stopwords.words('english')` reads in a later cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/margauxperez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pattern

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pattern.en import lemma, lexeme
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Rebinds the name `stopwords` from the imported NLTK corpus reader to the English
# stop words it returns; later cells use `stopwords` as a plain word collection.
stopwords = stopwords.words('english')

In [35]:
# Extra filter/stop words kept in a text file, one entry per line.
# Filter words are: character names, actor names
with open('remove_words.txt') as file:
    file_contents = file.read()
remove_words = file_contents.splitlines()

In [36]:
# Read our tweets from the previously created CSV
column_names = ["index", "date", "text", "handle"]
tweets_csv = pd.read_csv('out/tweets.csv', index_col=None, header=0, names=column_names)
# Replace missing texts with empty strings (they are not removed here).
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form warns in pandas 2.x and leaves
# the DataFrame untouched once Copy-on-Write is enabled.
tweets_csv['text'] = tweets_csv['text'].fillna('')
tweets = tweets_csv['text'].to_list()

### Text cleaning
When cleaning our data, we want to remove unnecessary characters such as punctuation and extra whitespace, so that we can focus solely on the terms found in the text.

In [37]:
import re
# NOTE: the string below is a bare expression statement, not a docstring; it
# summarizes what the clean_tweet function defined in this cell does.
"""
Remove blank texts, replaces text with lower case characters,
remove special characters, remove leading and trailing
whitespaces, and remove stopwords.
"""
# NOTE(review): lmtzr is not referenced again in the visible cells (clean_tweet
# lemmatizes with pattern's lemma() instead) - confirm whether it is still needed.
lmtzr = WordNetLemmatizer()

def clean_tweet(tweet):
    """
    Normalize one raw tweet into a space-separated string of lemmatized terms.

    Steps, in order: lower-case; drop apostrophes; strip @mentions, #hashtags
    and URLs; drop [bracketed] spans; blank out every character that is not a
    lower-case letter or digit; blank out digits; drop tokens found in the
    module-level ``stopwords`` and ``remove_words`` collections; lemmatize the
    remaining tokens with ``lemma``.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example the float NaN that
        pandas uses for a missing cell, or None) is treated as a blank tweet.

    Returns
    -------
    str
        The cleaned terms joined by single spaces, or "" when nothing survives.
    """
    # The previous guard compared the type against `np.float`, an alias that was
    # removed in NumPy 1.24 (AttributeError). A plain isinstance check does not
    # depend on NumPy and also covers None / numpy float scalars.
    if not isinstance(tweet, str):
        return ""

    temp = tweet.lower()
    # Drop apostrophes so a contraction stays one token ("didn't" -> "didnt").
    temp = re.sub(r"'", "", temp)
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)  # @mentions
    temp = re.sub(r"#[A-Za-z0-9_]+", "", temp)  # #hashtags
    temp = re.sub(r"http\S+", "", temp)  # URLs
    temp = re.sub(r"[()!?]", " ", temp)
    temp = re.sub(r"\[.*?\]", " ", temp)  # bracketed spans, including their content
    temp = re.sub(r"[^a-z0-9]", " ", temp)  # anything that is not a letter or digit
    temp = re.sub(r"[\d-]+", " ", temp)  # numerical chars / integers
    # split() with no argument discards leading, trailing and repeated
    # whitespace, so no separate whitespace-trimming substitution is needed.
    tokens = temp.split()
    terms = [
        lemma(w)  # lemmatization
        for w in tokens
        if w not in stopwords and w not in remove_words
    ]
    return " ".join(terms)

In [38]:
# Clean every tweet, then pair the raw and cleaned texts column-wise.
results = list(map(clean_tweet, tweets))
dictionary = dict(text=tweets, cleaned_text=results)

In [39]:
# Export the cleaned tweets into CSV.
# Empty strings are turned into NaN so that rows whose cleaned text came out
# empty can be dropped with dropna before writing.
nan_value = float("NaN")
cleaned_tweets = (
    pd.DataFrame(dictionary)
    .replace("", nan_value)
    .dropna(subset=["cleaned_text"])
)
cleaned_tweets.to_csv('out/cleaned_tweets.csv', index=False)
cleaned_tweets.head()

Unnamed: 0,text,cleaned_text
0,Finished sex education season 3 didn't like t...,didnt like cliff hanger
1,Currently half way through season 3 of Sex edu...,currently half way absolutely clue addictive w...
3,season 3 of Sex Education has such a good soun...,good soundtrack
4,"@JWRD__ - You season 3 , sex education season ...",film de slag om de schelde
5,on episode 6 season 3 of sex education and i'v...,never want smack anyone


### Text representation
We also want to be able to transform our data from terms into numerals where we can apply quantitative techniques.

1. **Document-term matrix**: occurrence of words across documents
2. **N-gram matrix**: occurrence of n-grams (phrases of length n) across documents
3. **TFIDF matrix**: term frequency adjusted by the rarity of the term across documents


In [40]:
#bi-gram (2 words in phrase)
def tweets_to_ngram(tweets, n=2):
    """
    Fit an n-gram count vectorizer on the given texts and pickle it.

    Parameters
    ----------
    tweets : iterable of str
        Documents to vectorize.
    n : int, default 2
        Phrase length; only n-grams of exactly this length are counted
        (ngram_range is (n, n)).

    Returns
    -------
    tuple
        (ngram, vectorizer): the matrix returned by fit_transform and the
        fitted CountVectorizer. The vectorizer is also written to
        'out/ngram.pk' as a side effect.
    """
    vectorizer = CountVectorizer(
        ngram_range=(n, n),
        token_pattern=r'\b\w+\b',
        min_df=1,
        max_features=2000)
    ngram = vectorizer.fit_transform(tweets)
    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of leaking an open handle.
    with open('out/ngram.pk', 'wb') as pickle_file:
        pickle.dump(vectorizer, pickle_file)
    return ngram, vectorizer

def tweets_to_tfidf(tweets):
    """
    Fit a TF-IDF vectorizer on the given texts and pickle it.

    Parameters
    ----------
    tweets : iterable of str
        Documents to vectorize.

    Returns
    -------
    tuple
        (tfidf, vectorizer): the matrix returned by fit_transform and the
        fitted TfidfVectorizer (limited to 2000 features). The vectorizer is
        also written to 'out/tfidf.pk' as a side effect.
    """
    vectorizer = TfidfVectorizer(max_features=2000)
    tfidf = vectorizer.fit_transform(tweets)
    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of leaking an open handle.
    with open('out/tfidf.pk', 'wb') as pickle_file:
        pickle.dump(vectorizer, pickle_file)
    return tfidf, vectorizer

In [41]:
# Get bi-gram matrix
ngram, ngram_v = tweets_to_ngram(cleaned_tweets['cleaned_text'], n=2)
# Read the shape directly from the sparse matrix; .toarray() would allocate a
# full dense copy only to measure it.
print('Ngram matrix shape:', ngram.shape)

Ngram matrix shape: (1649, 2000)


In [42]:
# Get TFIDF matrix
tfidf, tfidf_v = tweets_to_tfidf(cleaned_tweets['cleaned_text'])
# Read the shape directly from the sparse matrix; .toarray() would allocate a
# full dense copy only to measure it.
print('TFIDF matrix shape:', tfidf.shape)

TFIDF matrix shape: (1649, 2000)


### Term frequencies
We can convert our text matrices back into a list of terms and their accompanying frequencies.

In [43]:
def vector_to_frequency(vector, vectorizer):
    """
    Build a table of vocabulary terms and their column totals in the corpus.

    Each term in ``vectorizer.vocabulary_`` is paired with the sum of its
    column in ``vector``; the rows are returned as a DataFrame with columns
    'term' and 'frequency', sorted by 'frequency' in descending order and
    re-indexed from 0.
    """
    column_totals = vector.sum(axis=0)
    rows = []
    for term, column in vectorizer.vocabulary_.items():
        rows.append((term, column_totals[0, column]))
    table = pd.DataFrame(rows, columns=['term', 'frequency'])
    table = table.sort_values(by='frequency', ascending=False)
    return table.reset_index(drop=True)

In [44]:
# Rank the bi-grams by total count, save the table to CSV and preview the top 10.
# (A set_index('term') immediately followed by reset_index() returns the same
# frame, so that round trip is not needed.)
freq_ngram = vector_to_frequency(ngram, ngram_v)
freq_ngram.to_csv('out/frequency_ngram.csv', index=False)
freq_ngram.head(10)

Unnamed: 0,term,frequency
0,inclusive intimacy,25
1,get inclusive,25
2,character development,19
3,feel like,18
4,cant wait,14
5,black queer,13
6,queer story,13
7,one best,11
8,best character,10
9,love show,10


In [45]:
# Rank terms by their summed TF-IDF weight across all documents. The column is
# still called 'frequency', but for this matrix it holds a weight total, not a
# raw count. The table is saved to CSV and its first rows are displayed.
freq_tfidf = vector_to_frequency(tfidf, tfidf_v)
freq_tfidf.to_csv('out/frequency_tfidf.csv', index=False)
freq_tfidf.head()

Unnamed: 0,term,frequency
0,good,44.05337
1,get,40.419045
2,like,34.914889
3,show,34.008885
4,love,33.027175
