In [1]:
import json
import pickle
import re
import string
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Load data into dataframes

In [2]:
def load_tweets(type_of_data, nums=(20, 100, 234), data_dir='.'):
    """
    Load the tweet JSON dumps for one kind of data and combine them into one dataframe.

    One file named ``{type_of_data}_{num}.json`` is read for every ``num`` in ``nums``.
    Each file must contain a list of lists; the inner lists are flattened so that every
    inner item becomes one dataframe row (presumably one inner list per user or API
    page -- TODO confirm against the download script).
    ---
    :param type_of_data: File-name prefix of the data to load, e.g. 'timelines' or
        'favorites'.
    :param nums: Iterable of numeric file-name suffixes. Defaults to the three batches
        (20, 100, 234) used in this notebook.
    :param data_dir: Directory containing the JSON files. Defaults to the current
        working directory.
    :return: Pandas dataframe with one row per flattened item and a fresh 0..n-1 index
        (an empty dataframe when ``nums`` is empty).
    :raises FileNotFoundError: If one of the expected files does not exist.
    """

    dataframes = []

    for num in nums:
        file_path = Path(data_dir) / f'{type_of_data}_{num}.json'

        # JSON is UTF-8 by specification; be explicit so emoji and other non-ASCII
        # tweet text is not decoded with a platform-specific default encoding
        with open(file_path, encoding='utf-8') as f:
            data = json.load(f)

        # flatten the list of lists into a single list of records
        flat_data = [item for sublist in data for item in sublist]

        dataframes.append(pd.DataFrame(flat_data))

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not dataframes:
        return pd.DataFrame()

    # ignore_index gives the combined frame a clean 0..n-1 index in one step
    return pd.concat(dataframes, ignore_index=True)

In [3]:
# load the timeline and favorited-tweet dumps; load_tweets looks for the JSON files
# relative to the current working directory, so run the notebook from the data folder
timelines = load_tweets('timelines')
favorites = load_tweets('favorites')

FileNotFoundError: [Errno 2] No such file or directory: 'timelines_20.json'

In [None]:
def load_profiles(type_of_data, nums=(20, 100, 234), data_dir='.'):
    """
    Load the profile JSON dumps and combine them into one dataframe.

    One file named ``{type_of_data}_{num}.json`` is read with ``pd.read_json`` for every
    ``num`` in ``nums``. Unlike ``load_tweets`` no flattening is done: each file is
    parsed directly into a dataframe.
    ---
    :param type_of_data: File-name prefix of the data to load, e.g. 'profiles'.
    :param nums: Iterable of numeric file-name suffixes. Defaults to the three batches
        (20, 100, 234) used in this notebook.
    :param data_dir: Directory containing the JSON files. Defaults to the current
        working directory.
    :return: Pandas dataframe of all files stacked row-wise with a fresh 0..n-1 index
        (an empty dataframe when ``nums`` is empty).
    """

    # create one dataframe per json file
    dataframes = [
        pd.read_json(Path(data_dir) / f'{type_of_data}_{num}.json')
        for num in nums
    ]

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not dataframes:
        return pd.DataFrame()

    # ignore_index gives the combined frame a clean 0..n-1 index in one step
    return pd.concat(dataframes, ignore_index=True)

In [None]:
# load the profile dumps (files prefixed 'profiles_') into a single dataframe
profiles = load_profiles('profiles')

In [None]:
# export original dataframes to pickle files
# (written before any text cleaning, to the current working directory)
timelines.to_pickle('timelines.pkl')
favorites.to_pickle('favorites.pkl')
profiles.to_pickle('profiles.pkl')

## Clean text for both timelines and favorites

In [None]:
# preview the first rows of the favorites dataframe before cleaning its text column
favorites.head()

In [None]:
def clean_text_one(docs):
    """
    Cleans tweet text so that it is in a form suitable for topic modeling.

    Applied to every document, in this order: strip URLs, @mentions, #hashtags and
    digits; turn newlines into spaces; drop ASCII punctuation; lowercase; drop
    stand-alone single letters; collapse runs of whitespace into one space.
    Leading/trailing whitespace is not stripped.
    ---
    :param docs: Series of documents (strings) to be processed.
    :return: Series of processed texts with the same index as ``docs``.
    """

    # Raw strings avoid the invalid '\(' escape warning that a plain string literal
    # triggers. NOTE: inside the class '[$-_@.&+#]' the '$-_' part is a character
    # *range* (it spans '/', ':', digits, upper-case letters, ...); matching the path
    # of a URL relies on it, so the pattern is kept exactly as it was.
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                             r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = re.compile(r'(@[A-Za-z0-9_]+)')
    hashtag_pattern = re.compile(r'(#[A-Za-z0-9_]+)')
    number_pattern = re.compile(r'\d+')
    # a letter with no non-whitespace neighbour on either side; lookarounds (instead
    # of consuming the surrounding spaces) let consecutive single letters and letters
    # at the start/end of the text match as well
    single_letter_pattern = re.compile(r'(?<!\S)[a-zA-Z](?!\S)')
    whitespace_pattern = re.compile(r'\s+')
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _clean(text):
        # Run the whole cleaning pipeline on one document.
        text = url_pattern.sub('', text)
        text = mention_pattern.sub('', text)
        text = hashtag_pattern.sub('', text)
        text = number_pattern.sub('', text)
        # replace (not delete) newlines so words on adjacent lines stay separate
        text = text.replace('\n', ' ')
        text = text.translate(punctuation_table)
        text = text.lower()
        text = single_letter_pattern.sub('', text)
        # collapse last so gaps left by the removals above become a single space
        return whitespace_pattern.sub(' ', text)

    # one pass over the series instead of one pass per cleaning step
    return docs.map(_clean)

In [None]:
def clean_text_two(docs):
    """
    Lemmatizes every word of every document with NLTK's WordNet lemmatizer.

    ``WordNetLemmatizer.lemmatize`` works on a single word, so each document is
    processed token by token (a token is a run of non-whitespace characters) and the
    original whitespace between tokens is kept as is. The lemmatizer is called with
    its default part-of-speech setting.
    ---
    :param docs: Iterable (e.g. Series) of documents to be processed.
    :return: List of processed texts, in the same order as ``docs``. Entries that are
        not strings (e.g. missing values) are passed through unchanged.
    :raises LookupError: If the WordNet corpus is not installed
        (run ``nltk.download('wordnet')``); previously this was silently swallowed
        and the text came back unlemmatized.
    """

    lemmatizer = WordNetLemmatizer()

    def _lemmatize_token(match):
        # Replace one whitespace-delimited token with its lemma.
        return lemmatizer.lemmatize(match.group(0))

    lemmatized_tweets = []
    for text in docs:
        if isinstance(text, str):
            lemmatized_tweets.append(re.sub(r'\S+', _lemmatize_token, text))
        else:
            # keep non-text entries untouched rather than failing the whole batch
            lemmatized_tweets.append(text)

    return lemmatized_tweets

In [None]:
# clean the favorited tweets using first function
# (clean_text_one maps over the 'text' column and returns a new Series)
favorites_cleaned_text = clean_text_one(favorites.text)
favorites_cleaned_text

In [None]:
# lemmatize the favorited tweets with the second function
favorites_cleaned_text = clean_text_two(favorites_cleaned_text)
# clean_text_two returns a plain list, which would be printed in bulk;
# preview only the first few entries
favorites_cleaned_text[:5]

In [None]:
# clean the timeline tweets using first function
# (clean_text_one maps over the 'text' column and returns a new Series)
timelines_cleaned_text = clean_text_one(timelines.text)
timelines_cleaned_text

In [None]:
# lemmatize the timeline tweets with the second function
timelines_cleaned_text = clean_text_two(timelines_cleaned_text)
# clean_text_two returns a plain list, which would be printed in bulk;
# preview only the first few entries
timelines_cleaned_text[:5]

## Add texts back to dataframes and export for modeling

In [None]:
# build a new frame whose 'text' column holds the cleaned tweets; `timelines` itself
# is left untouched
timelines_cleaned = timelines.assign(text=timelines_cleaned_text)

In [None]:
# build a new frame whose 'text' column holds the cleaned tweets; `favorites` itself
# is left untouched
favorites_cleaned = favorites.assign(text=favorites_cleaned_text)

In [None]:
# no text cleaning is applied to the profiles; copy them unchanged under a *_cleaned
# name so all three frames are exported the same way
profiles_cleaned = profiles.copy()

In [None]:
# export cleaned dataframes to pickle files in the current working directory
# (separate file names, so the original-data pickles are not overwritten)
favorites_cleaned.to_pickle('favorites_cleaned_lemma.pkl')
timelines_cleaned.to_pickle('timelines_cleaned_lemma.pkl')
profiles_cleaned.to_pickle('profiles_cleaned_lemma.pkl')