In [2]:
import ast
import re
import nltk
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\radellng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\radellng\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\radellng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\radellng\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
df = pd.read_csv("./data/combined_twitter_data_with_tweets_corpus.csv")

In [None]:
df_copy.columns

In [4]:
def create_followers_following_ratio(df):
    df['following_to_followers_ratio'] = df['friends_count'] / df['followers_count']
    return df

In [5]:
def has_url_feature(df):
    df['has_url'] = df['url'].apply(lambda x: 0 if pd.isnull(x) else 1)
    return df

#2685 has url, 8427 has no url (is na)

In [6]:
def clean_texts(df):
    def process_tweets_list(corpus):
        
        corpus_processed = []
        for tweet_list in corpus:
            tweet_list = str(tweet_list)
            row_processed = ""
            
            #replace RT and @
            row_processed = tweet_list.replace("RT", "" ) 
            row_processed = row_processed.replace("@", "" )
            
            row_processed = re.sub(r'http\S+', "", row_processed) #remove any URLs in tweets
            row_processed = re.sub(r'[^\x00-\x7f]', "", row_processed) #remove Non-ASCII characters
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed if not row_processed == 'nan' else "") # handle NA
            

        return corpus_processed
    
    def process_description(corpus):
        
        corpus_processed = []
        for row in corpus:
            row = str(row)
            row_processed = re.sub(r'[^\x00-\x7f]', "", row) #remove Non-ASCII characters
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed if not row_processed == 'nan' else "") # handle NA
            
        return corpus_processed
    
    df["tweets_list_processed"] = process_tweets_list(df["tweets_list"])
    df["description_processed"] = process_description(df["description"])
    
    return df
            

In [None]:
clean_texts(df_tweets)

In [7]:
class LemmatizeTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, text):
        return [self.lemmatizer.lemmatize(word) for word in word_tokenize(text)]

In [10]:
def generate_nlp_features(df):
    
    #tweets
    vect_tweets = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )
    
    tweets_processed = pd.Series(df["tweets_list_processed"])
    tfidf_fit_tweets = vect_tweets.fit(tweets_processed)
    tweets_tfidf_array = tfidf_fit_tweets.transform(tweets_processed).toarray()
    tweets_tfidf_df = pd.DataFrame(tweets_tfidf_array)
    tweets_tfidf_df.columns = list(map(lambda x: "tweets_" + str(x), tweets_tfidf_df.columns))
    df = pd.merge(df, tweets_tfidf_df, left_index=True, right_index=True)
    
    #description
    vect_description = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )
    
    description_processed = pd.Series(df["description_processed"])
    tfidf_fit_description = vect_description.fit(description_processed)
    description_tfidf_array = tfidf_fit_description.transform(description_processed).toarray()
    description_tfidf_df = pd.DataFrame(description_tfidf_array)
    description_tfidf_df.columns = list(map(lambda x: "description_" + str(x), description_tfidf_df.columns))
    df = pd.merge(df, description_tfidf_df, left_index=True, right_index=True)
    
    return (df, tfidf_fit_tweets, tfidf_fit_description)

def nlp_transform_test(df, tfidf_fit_tweets, tfidf_fit_description):
    tweets_tfidf_array = tfidf_fit_tweets.transform(df['tweets_list_processed']).toarray()
    tweets_tfidf_df = pd.DataFrame(tweets_tfidf_array)
    tweets_tfidf_df.columns = list(map(lambda x : "tweets_" + str(x), tweets_tfidf_df.columns))
    df = pd.merge(df, tweets_tfidf_df , left_index=True, right_index=True)
    
    description_tfidf_array = tfidf_fit_description.transform(df['description_processed']).toarray()
    description_tfidf_df = pd.DataFrame(description_tfidf_array)
    description_tfidf_df.columns = list(map(lambda x : "description_" + str(x), description_tfidf_df.columns))
    df = pd.merge(df, description_tfidf_df , left_index=True, right_index=True)
    
    return df
    
    

In [12]:
result = clean_texts(df)
result, tfidf_fit_tweets, tfidf_fit_description = generate_nlp_features(result)


In [16]:
print(tfidf_fit_description)

TfidfVectorizer(max_features=100, min_df=10, ngram_range=(1, 3),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                tokenizer=<__main__.LemmatizeTokenizer object at 0x0000019129E2F880>)


In [17]:
result = nlp_transform_test(result, tfidf_fit_tweets, tfidf_fit_description)

In [23]:
result.columns[100]

'tweets_61_x'