This notebook cleans the Twitter data and joins it to the NBA game data so that it can be used for analysis.

In [67]:
import datetime as datetime
import functools
import re

import nltk
import pandas as pd
import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the tweet CSV into a DataFrame; the bare name on the last line
# renders the frame as this cell's output.
mavs_df = pd.read_csv('mavs_tweets_all.csv')
mavs_df


In [81]:
# Pull out the 'text' column, which holds the tweet bodies to be cleaned.
tweet_text = mavs_df['text']

Steps for preprocessing the tweet text:
1. Removing @'s
2. Removing links
3. Separating hashtags
4. Removing stop words

In [90]:
#Cleaning Functions

def remove_ats(tw):
    '''
    Removes @-mentions from a tweet.

    A mention is an '@' together with every character after it up to the
    next whitespace character (space, tab, newline, ...) or the end of
    the tweet.

    Parameters:
       tw: the tweet text, as a string.

    Returns:
       The tweet with every mention deleted and leading/trailing
       whitespace stripped. A tweet that contains no '@' is returned
       exactly as given (it is not stripped).

    Notes:
       - Only the mention itself is deleted, so a mention in the middle
         of a tweet leaves the whitespace on both sides of it behind.
       - A lone '@' (e.g. "meet @ 5pm") is deleted as well.
    '''
    if '@' not in tw:
        return tw

    # A single regex pass deletes each mention where it stands. Deleting by
    # position (rather than str.replace on the handle text) keeps "@a" from
    # chewing the front off "@ab", and \S stops at newlines and tabs so the
    # word after "@user\n" is no longer swallowed.
    return re.sub(r'@\S*', '', tw).strip()

def remove_links(tw):
    '''
    Removes https links from tweets.

    A link is the text 'https' together with every character after it up
    to the next whitespace character (space, tab, newline, ...) or the
    end of the tweet.

    Parameters:
       tw: the tweet text, as a string.

    Returns:
       The tweet with every https link deleted and leading/trailing
       whitespace stripped. A tweet that does not contain 'https' is
       returned exactly as given (it is not stripped).

    Notes:
       - Plain 'http://' links are not matched and stay in the text.
       - Only the link itself is deleted; whitespace around it is kept.
    '''
    if 'https' not in tw:
        return tw

    # Delete each link in place with one regex pass. \S stops at any
    # whitespace, so a link that ends a line no longer drags the first word
    # of the next line with it, and deleting by position keeps one link from
    # clipping a longer link that happens to start with the same characters.
    return re.sub(r'https\S*', '', tw).strip()

def separate_hashtags(tw):
    '''
    Finds hashtags and separates them into lowercase words.

    A hashtag is a '#' together with every character after it up to the
    next whitespace character (space, tab, newline, ...) or the end of
    the tweet. Inside a hashtag:
       - a lowercase letter is kept as it is;
       - a '#' becomes a single space;
       - any other character (capital, digit, punctuation) starts a new
         word: it is lowercased and preceded by a space.
    So "#GoMavs" becomes "  go mavs".

    Parameters:
       tw: the tweet text, as a string.

    Returns:
       The tweet with every hashtag rewritten as above and
       leading/trailing whitespace stripped. A tweet that contains no '#'
       is returned exactly as given (it is not stripped).

    Known limitations:
       1. Hashtags with no capital letters are not split ("#gomavs").
       2. Runs of capitals or digits are split one character at a time
          ("#MFFL" -> "m f f l").
    '''
    if '#' not in tw:
        return tw

    def split_tag(match):
        # Rebuild one hashtag character by character using the rules above.
        pieces = []
        for char in match.group(0):
            if char.islower():
                pieces.append(char)
            elif char == '#':
                pieces.append(' ')
            else:
                pieces.append(' ' + char.lower())
        return ''.join(pieces)

    # Rewrite each hashtag where it stands. Working by position (instead of
    # str.replace on the tag text) means "#Mavs" no longer rewrites the front
    # of "#MavsWin" and leaves "Win" unsplit; \S stops at newlines and tabs so
    # ordinary words on the next line are not treated as part of the tag.
    return re.sub(r'#\S*', split_tag, tw).strip()

@functools.lru_cache(maxsize=1)
def _english_stopwords():
    '''
    Loads the NLTK English stopword list once and reuses it on later calls.
    '''
    return frozenset(stopwords.words('english'))


def remove_stopwords(tw):
    '''
    Removes stopwords from the tweet text.

    The tweet is split with NLTK's word_tokenize and every token that is
    an English stopword is dropped. The comparison ignores case, so
    "The" and "I" are removed just like "the" and "i"; tokens that are
    kept retain their original casing.

    Parameters:
       tw: the tweet text, as a string.

    Returns:
       The remaining tokens joined with single spaces.

    Improvements:
    1. Abbreviations can leave random letters
    2. Some words in the stopwords list may be useful (over, under)
    '''
    # The stopword set never changes between tweets, so it is built once
    # (cached helper) instead of being re-read from the corpus on every call.
    stop_words = _english_stopwords()

    # NLTK's English stopword list is lowercase, so compare lowercased tokens.
    kept = [word for word in word_tokenize(tw) if word.lower() not in stop_words]
    return " ".join(kept)

In [91]:
#Run every tweet through the cleaning functions above, one stage after another

tweet_text = mavs_df['text']

# Order matters: mentions and links are removed before hashtags are split,
# and stopwords are removed last.
cleaning_stages = (remove_ats, remove_links, separate_hashtags, remove_stopwords)

cleaned_tws = []
for tweet in tqdm.tqdm(tweet_text):
    cleaned = tweet
    for stage in cleaning_stages:
        cleaned = stage(cleaned)
    cleaned_tws.append(cleaned)

100%|██████████| 176123/176123 [01:11<00:00, 2461.44it/s]


Next, use the list of cleaned tweets as the text column of a new DataFrame.

In [95]:
# Build a separate frame instead of aliasing mavs_df: a plain assignment would
# make both names point at the same object, so writing the cleaned text would
# overwrite the raw tweets in mavs_df['text'] and re-running the cleaning cell
# would then process already-cleaned text.
mavs_df_textprocessed = mavs_df.assign(text=cleaned_tws)
mavs_df_textprocessed.to_csv('mavs_df_textprocessed.csv')