In [7]:
import pandas as pd
import json
import pickle

from nlp_pipeline import *
from nltk.corpus import stopwords       
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Load data into dataframes

In [8]:
def load_data(type_of_data):
    """
    Read in a JSON file based on whether it is from the users' timeline, favorites, or
    is profile information (timelines, favorites, or profiles).
    ---
    :param type_of_data: String representing the type of data to load.
    :return: Pandas dataframe of the data that was pulled from the Twitter API.
    """

    nums = [20, 100, 234]
    dataframes = []

    for file in [f'{type_of_data}_' + str(num) + '.json' for num in nums]:
        # load in the file as a list of dictionaries
        with open(file) as f:
            data = json.load(f)

        # flatten the data
        flat_data = [item for sublist in data for item in sublist]

        # create dataframe from flattened list
        df = pd.DataFrame(flat_data)

        # append to list
        dataframes.append(df)
        
    full_df = pd.concat(dataframes)
    full_df.reset_index(inplace=True, drop=True)

    return full_df

In [9]:
timelines = load_data('timelines')
favorites = load_data('favorites')
profiles = load_data('profiles')

In [10]:
# export original dataframes to pickle files
timelines.to_pickle('timelines.pkl')
favorites.to_pickle('favorites.pkl')
profiles.to_pickle('profiles.pkl')

## Clean text for both timelines and favorites

In [11]:
favorites.head()

Unnamed: 0,favorited_by_id,id,created_at,screen_name,user_id,in_reply_to_status_id,in_reply_to_screen_name,in_reply_to_user_id,favorite_count,retweet_count,text
0,2649540547,1236533635290890240,Sun Mar 08 06:06:06 +0000 2020,AOC,138203134,1.236524e+18,nbcsnl,28221296.0,115728,3550,@nbcsnl ok this is legendary
1,2649540547,1236523965759262720,Sun Mar 08 05:27:41 +0000 2020,nbcsnl,28221296,,,,866736,169323,𝗙𝗹𝗶𝗽𝗽𝗲𝗱 𝘁𝗵𝗲 𝘀𝘄𝗶𝘁𝗰𝗵. https://t.co/t8qHcGRUhY
2,2649540547,1236510979577524225,Sun Mar 08 04:36:05 +0000 2020,DojaCat,568545739,,,,103837,6546,Please try to be cool when you go to see peopl...
3,2649540547,1236496908891435009,Sun Mar 08 03:40:10 +0000 2020,KenJee_DS,1159830350102781953,,,,13,1,"I just realized that yesterday I broke 100,000..."
4,2649540547,1236075484955410432,Fri Mar 06 23:45:35 +0000 2020,Kwammentary,1094750013304029187,1.236074e+18,jaboukie,319769408.0,101,0,@jaboukie https://t.co/APYUYKSDQU


In [12]:
def clean_text_one(docs):
    """
    Cleans tweet text so that it is in a form suitable for topic modeling.
    ---
    :param docs: Series of documents to be processed. 
    :return: Series of processed texts.
    """

    # remove URLs and hyperlinks
    text_nourl = lambda x: re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x)
    
    # remove @ names
    text_noname = lambda x: re.sub('(@[A-Za-z0-9_]+)', '', x)
    
    # remove hashtags
    text_nohash = lambda x: re.sub('(#[A-Za-z0-9_]+)', '', x)
    
    # remove numbers
    text_nonum = lambda x: re.sub(r'\d+', '', x)
    
    # remove the new line character
    text_nonewline = lambda x: re.sub('\n', '', x)
    
    # remove punctuation
    text_nopunct = lambda x: ''.join([char for char in x if char not in string.punctuation])
    
    # convert all letters to lowercase
    text_lower = lambda x: x.lower()
    
    # substitute multiple spaces with single space
    text_nospaces = lambda x: re.sub(r'\s+', ' ', x, flags=re.I)
    
    # remove all single characters
    text_single = lambda x: re.sub(r'\s+[a-zA-Z]\s+', ' ', x)

    # apply all cleaning functions to input text
    for clean_func in [text_nourl, text_noname, text_nohash, text_nonum, text_nonewline,\
        text_nopunct, text_lower, text_nospaces, text_single]:
        docs = docs.map(clean_func)

    return docs

In [13]:
def clean_text_two(docs):
    """
    Cleans tweet text so that it is in a form suitable for topic modeling.
    ---
    :param docs: Series of documents to be processed. 
    :return: Series of processed texts.
    """

    wordNetLemmatizer = WordNetLemmatizer()

    lemmatized_tweets = []
    for text in docs:
        try:
            lemmatized_tweets.append(wordNetLemmatizer.lemmatize(text))
        except:
            pass

    return lemmatized_tweets

<font color='red'>Come back and fix lemmetization</font>

In [14]:
# clean the favorited tweets using all three functions
favorites_cleaned_text = clean_text_one(favorites.text)

In [15]:
favorites_cleaned_text

0                                      ok this is legendary
1                                       𝗙𝗹𝗶𝗽𝗽𝗲𝗱 𝘁𝗵𝗲 𝘀𝘄𝗶𝘁𝗰𝗵 
2         please try to be cool when you go to see peopl...
3         i just realized that yesterday broke view coun...
4                                                          
                                ...                        
547458                  if you pee on it yell mineand it is
547459    pulled up to starbuckssaw barista in training ...
547460    someone please tell me what she has over meill...
547461    there needs to be new yelp feature to help acc...
547462                                                 smh 
Name: text, Length: 547463, dtype: object

In [16]:
# clean the timeline tweets using all three functions
timelines_cleaned_text = clean_text_one(timelines.text)

In [17]:
timelines_cleaned_text

0               understanding power analysis in ab testing 
1           international talent segmentation for startups 
2                                   no thoughts head empty 
3                                                     hi me
4                                                bakabrooks
                                ...                        
615595    dont get there too late or the hipsters will t...
615596    im going just so can bring my ipod and listen ...
615597                                    goldie asap rocky
615598                harlem shake bauer mota flosstradamus
615599                                      cdc dom kennedy
Name: text, Length: 615600, dtype: object

## Add texts back to dataframes and export for modeling

In [18]:
timelines_cleaned = timelines.copy()
timelines_cleaned['text'] = timelines_cleaned_text

In [19]:
favorites_cleaned = favorites.copy()
favorites_cleaned['text'] = favorites_cleaned_text

In [20]:
profiles_cleaned = profiles.copy()

In [21]:
# export cleaned dataframes
favorites_cleaned.to_pickle('favorites_cleaned.pkl')
timelines_cleaned.to_pickle('timelines_cleaned.pkl')
profiles_cleaned.to_pickle('profiles_cleaned.pkl')