In [1]:
import ast
import re
import nltk
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\radellng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\radellng\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\radellng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\radellng\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
df = pd.read_csv("./data/combined_twitter_data_with_tweets_corpus.csv")

In [3]:

def create_followers_following_ratio(df):
    #followers divide by following (high means popular, low means more following)
    df['following_to_followers_ratio'] = df['friends_count'] / df['followers_count']
    return df

def has_url_feature(df):
    #1 if has url, 0 if no url
    df['has_url'] = df['url'].apply(lambda x: 0 if pd.isnull(x) else 1)
    return df

#2685 has url, 8427 has no url (is na)

In [5]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'name', 'screen_name',
       'statuses_count', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'url', 'lang', 'time_zone',
       'location', 'default_profile', 'default_profile_image', 'geo_enabled',
       'profile_image_url', 'profile_banner_url',
       'profile_use_background_image', 'profile_background_image_url_https',
       'profile_text_color', 'profile_image_url_https',
       'profile_sidebar_border_color', 'profile_background_tile',
       'profile_sidebar_fill_color', 'profile_background_image_url',
       'profile_background_color', 'profile_link_color', 'utc_offset',
       'protected', 'verified', 'description', 'created_at', 'updated',
       'account_type', 'tweets_list'],
      dtype='object')

In [32]:
def name_features(df):
    #get length of username and screen name
    df['username_length'] = df['name'].apply(lambda x: len(str(x)))
    df['screen_name_length'] = df['screen_name'].apply(lambda x: len(str(x)))
    
    #anything that is not a-z or 0-9 will be blocked, outputs length
    df['username_spec_char_count'] = df['name'].apply(lambda x: len(re.findall(r'[^A-Za-z0-9]+', str(x))))
    df['screen_name_spec_char_count'] = df['screen_name'].apply(lambda x: len(re.findall(r'[^A-Za-z0-9]+', str(x))))
    return df

In [31]:
name_features(df)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,...,verified,description,created_at,updated,account_type,tweets_list,username_length,screen_name_length,username_spec_char_count,screen_name_spec_char_count
0,0,1,22903,effeffe,effeffe,164,132,194,12,4,...,,L'uomo ha creato dio a sua immagine e somiglia...,Sun Nov 26 15:19:32 +0000 2006,14/2/2015 11:32,real,['@TheFakeProject cerca followers reali!!! #Im...,7,7,0,0
1,1,3,286543,Alessio Bragadini,abragad,6892,930,535,478,28,...,,Web and social media developer from Italy,Wed Dec 27 14:55:17 +0000 2006,14/2/2015 11:32,real,"[""RT @hotdogsladies: The real problem with ema...",17,7,1,0
2,2,4,438023,fullcaffeine,fullcaffeine,2885,173,444,41,2,...,,,Tue Jan 02 09:01:50 +0000 2007,14/2/2015 11:32,real,['Amare il #giornalismo : @Internazionale @la_...,12,12,0,0
3,3,5,586003,Maurizio Tesconi,myself2048,216,97,234,2,0,...,,CNR Researcher. Web developer. Social media ge...,Fri Jan 05 16:20:42 +0000 2007,14/2/2015 11:32,real,"['RT @TheFakeProject: Dear followers, phase 2 ...",16,10,1,0
4,4,6,628563,Massimo Moretti,MaxMoretti,505,154,314,0,3,...,,The truth is out there!,Fri Jan 12 11:06:17 +0000 2007,14/2/2015 11:32,real,['@Swype when the Italian dictionary will be ...,15,10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11107,11107,16273,385095166,Hull Util Jobs,tmj_UKH_UTIL,2,332,310,0,0,...,,Follow this account for geo-targeted Utilities...,Tue Oct 04 21:25:35 +0000 2011,2016-03-15 13:49:14,fake,,14,12,2,2
11108,11108,16274,385096188,Glasgow Util Jobs,tmj_UKG_UTIL,1,327,308,0,4,...,,Follow this account for geo-targeted Utilities...,Tue Oct 04 21:28:08 +0000 2011,2016-03-15 13:49:14,fake,,17,12,2,2
11109,11109,16275,398274058,Sandwich Uk Mgmt,tmj_SND_mgmt,2,276,265,0,1,...,,Follow this account for geo-targeted Business/...,Tue Oct 25 20:41:55 +0000 2011,2016-03-15 13:49:14,fake,,16,12,2,2
11110,11110,16292,631519728,PR Business/Mgmt,tmj_ptr_mgmt,36,527,278,0,21,...,,Follow this account for geo-targeted Business/...,Mon Jul 09 21:35:55 +0000 2012,2016-03-15 13:49:15,fake,,16,12,2,2


In [6]:
def clean_texts(df):
    def process_tweets_list(corpus):
        
        corpus_processed = []
        for tweet_list in corpus:
            tweet_list = str(tweet_list)
            row_processed = ""
            
            #replace RT and @
            row_processed = tweet_list.replace("RT", "" ) 
            row_processed = row_processed.replace("@", "" )
            
            row_processed = re.sub(r'http\S+', "", row_processed) #remove any URLs in tweets
            row_processed = re.sub(r'[^\x00-\x7f]', "", row_processed) #remove Non-ASCII characters
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed if not row_processed == 'nan' else "") # handle NA
            

        return corpus_processed
    
    def process_description(corpus):
        
        corpus_processed = []
        for row in corpus:
            row = str(row)
            row_processed = re.sub(r'[^\x00-\x7f]', "", row) #remove Non-ASCII characters
            row_processed = re.sub(r'[^\w\s]', '', row_processed) # remove punctuation
            corpus_processed.append(row_processed if not row_processed == 'nan' else "") # handle NA
            
        return corpus_processed
    
    df["tweets_list_processed"] = process_tweets_list(df["tweets_list"])
    df["description_processed"] = process_description(df["description"])
    
    return df
            

In [None]:
clean_texts(df_tweets)

In [7]:
class LemmatizeTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, text):
        return [self.lemmatizer.lemmatize(word) for word in word_tokenize(text)]

In [10]:
def generate_nlp_features(df):
    
    #tweets
    vect_tweets = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )
    
    tweets_processed = pd.Series(df["tweets_list_processed"])
    tfidf_fit_tweets = vect_tweets.fit(tweets_processed)
    tweets_tfidf_array = tfidf_fit_tweets.transform(tweets_processed).toarray()
    tweets_tfidf_df = pd.DataFrame(tweets_tfidf_array)
    tweets_tfidf_df.columns = list(map(lambda x: "tweets_" + str(x), tweets_tfidf_df.columns))
    df = pd.merge(df, tweets_tfidf_df, left_index=True, right_index=True)
    
    #description
    vect_description = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )
    
    description_processed = pd.Series(df["description_processed"])
    tfidf_fit_description = vect_description.fit(description_processed)
    description_tfidf_array = tfidf_fit_description.transform(description_processed).toarray()
    description_tfidf_df = pd.DataFrame(description_tfidf_array)
    description_tfidf_df.columns = list(map(lambda x: "description_" + str(x), description_tfidf_df.columns))
    df = pd.merge(df, description_tfidf_df, left_index=True, right_index=True)
    
    return (df, tfidf_fit_tweets, tfidf_fit_description)

def nlp_transform_test(df, tfidf_fit_tweets, tfidf_fit_description):
    tweets_tfidf_array = tfidf_fit_tweets.transform(df['tweets_list_processed']).toarray()
    tweets_tfidf_df = pd.DataFrame(tweets_tfidf_array)
    tweets_tfidf_df.columns = list(map(lambda x : "tweets_" + str(x), tweets_tfidf_df.columns))
    df = pd.merge(df, tweets_tfidf_df , left_index=True, right_index=True)
    
    description_tfidf_array = tfidf_fit_description.transform(df['description_processed']).toarray()
    description_tfidf_df = pd.DataFrame(description_tfidf_array)
    description_tfidf_df.columns = list(map(lambda x : "description_" + str(x), description_tfidf_df.columns))
    df = pd.merge(df, description_tfidf_df , left_index=True, right_index=True)
    
    return df
    
    

In [12]:
result = clean_texts(df)
result, tfidf_fit_tweets, tfidf_fit_description = generate_nlp_features(result)


In [16]:
print(tfidf_fit_description)

TfidfVectorizer(max_features=100, min_df=10, ngram_range=(1, 3),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                tokenizer=<__main__.LemmatizeTokenizer object at 0x0000019129E2F880>)


In [17]:
result = nlp_transform_test(result, tfidf_fit_tweets, tfidf_fit_description)

In [23]:
result.columns[100]

'tweets_61_x'