# Text Classification

In [2]:
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

## 1. Text Processing 

In [3]:
def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """ Lower-cases, strips punctuation, tokenizes and lemmatizes a piece of text
    Inputs:
        text: str: raw text
        lemmatizer: an instance of a class implementing the lemmatize() method
                    (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs:
        list(str): lemmatized tokens, in their original order
    """
    lowered = text.lower()
    # Drop the possessive "'s" first, then delete any apostrophes that remain,
    # so that contractions collapse into a single token instead of being split.
    without_apostrophes = lowered.replace("'s", "").replace("'", "")
    # Every remaining ASCII punctuation character becomes a space, i.e. a token separator.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    cleaned = without_apostrophes.translate(punctuation_to_space)

    return [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(cleaned)]


In [14]:
# Smoke test for process(): the output below shows the sentence lower-cased,
# split into tokens, and with the trailing period removed.
text = "This is a sample test input for processing."
print(process(text))

['this', 'is', 'a', 'sample', 'test', 'input', 'for', 'processing']


In [15]:

# Load the training tweets. na_filter=False turns off pandas' missing-value
# detection, so empty fields stay as strings rather than becoming NaN.
tweets = pd.read_csv("tweets_train.csv", na_filter=False)
print(tweets.head())


      screen_name                                               text
0             GOP  RT @GOPconvention: #Oregon votes today. That m...
1    TheDemocrats  RT @DWStweets: The choice for 2016 is clear: W...
2  HillaryClinton  Trump's calling for trillion dollar tax cuts f...
3  HillaryClinton  .@TimKaine's guiding principle: the belief tha...
4        timkaine  Glad the Senate could pass a #THUD / MilCon / ...


In [16]:
def process_all(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """ process all text in the dataframe using the process() function.
    Inputs
        df: pd.DataFrame: dataframe containing a column 'text' loaded from the CSV file
        lemmatizer: an instance of a class implementing the lemmatize() method
                    (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs
        pd.DataFrame: dataframe in which the values of text column have been changed from str to list(str),
                        the output from process() function. Other columns are unaffected.
                        The input dataframe is not modified.
    """
    result_df = df.copy(deep=True)
    # Replace the column in a single assignment instead of writing cell by cell
    # through result_df.text[i]: that form is chained assignment and treats a
    # running counter as an index *label*, which breaks on any non-default index.
    # The caller's lemmatizer is forwarded so that the argument actually takes effect.
    result_df['text'] = result_df['text'].apply(
        lambda raw_text: process(raw_text, lemmatizer)
    )

    return result_df


# Tokenize every training tweet; `tweets` itself is left untouched because
# process_all() works on a deep copy.
processed_tweets = process_all(tweets)
print(processed_tweets.head())


      screen_name                                               text
0             GOP  [rt, gopconvention, oregon, vote, today, that,...
1    TheDemocrats  [rt, dwstweets, the, choice, for, 2016, is, cl...
2  HillaryClinton  [trump, calling, for, trillion, dollar, tax, c...
3  HillaryClinton  [timkaine, guiding, principle, the, belief, th...
4        timkaine  [glad, the, senate, could, pas, a, thud, milco...


## 2. Feature Construction 

In [17]:
def get_rare_words(processed_tweets):
    """ finds the words that occur exactly once across all processed tweets
    Inputs:
        processed_tweets: pd.DataFrame: the output of process_all() function
    Outputs:
        list(str): list of rare words, sorted alphabetically.
    """
    # Accumulate corpus-wide token frequencies one tweet at a time.
    frequency = Counter()
    for tokens in processed_tweets.text:
        frequency.update(tokens)

    # "Rare" means the token was seen exactly once in the whole corpus.
    return sorted(token for token, count in frequency.items() if count == 1)
    
# Collect the words that appear only once in the training corpus and report how many there are.
print(processed_tweets.head())
rare_words = get_rare_words(processed_tweets)
print(len(rare_words)) 


      screen_name                                               text
0             GOP  [rt, gopconvention, oregon, vote, today, that,...
1    TheDemocrats  [rt, dwstweets, the, choice, for, 2016, is, cl...
2  HillaryClinton  [trump, calling, for, trillion, dollar, tax, c...
3  HillaryClinton  [timkaine, guiding, principle, the, belief, th...
4        timkaine  [glad, the, senate, could, pas, a, thud, milco...
21280


Construct a sparse matrix of features for each tweet with the help of `sklearn.feature_extraction.text.TfidfVectorizer`. Remember to ignore the rare words obtained above and NLTK's stop words during the feature creation step. Leave all other optional parameters (e.g., `vocabulary`, `norm`, etc.) at their default values.

In [18]:
def create_features(processed_tweets, rare_words):
    """ creates the feature matrix using the processed tweet text
    Inputs:
        processed_tweets: pd.DataFrame: output of process_all(); the 'text' column holds list(str) tokens
        rare_words: list(str): the output of get_rare_words() function
    Outputs:
        sklearn.feature_extraction.text.TfidfVectorizer: the fitted TfidfVectorizer object
                                                we need this to transform test tweets in the same way as train tweets
        scipy.sparse.csr.csr_matrix: sparse bag-of-words TF-IDF feature matrix
    """
    # Words to ignore: NLTK's English stop words followed by the corpus-specific rare words.
    # Built as a new list so neither input list is modified.
    stop_words = list(nltk.corpus.stopwords.words('english')) + list(rare_words)

    # TfidfVectorizer is imported by name at the top of the notebook; reaching it through
    # the bare `sklearn` package only works if the submodule happens to be loaded already.
    vect = TfidfVectorizer(stop_words=stop_words)

    # The vectorizer expects one string per document, so re-join each token list with spaces.
    documents = [" ".join(tokens) for tokens in processed_tweets.text]
    bow_matrix = vect.fit_transform(documents)

    return (vect, bow_matrix)

# Fit the TF-IDF vectorizer on the training tweets. `tfidf` is kept so that the
# test tweets can later be transformed with the same fitted vectorizer.
(tfidf, X) = create_features(processed_tweets, rare_words)
print(tfidf)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',...🇺🇸🇺🇸🇺🇸', '🚙clean', '🚨🚨', '🚪', '🚪close', '🚫choice', '🚫climate', '🚫obamacare', '🚫👷', '🚴', '🤑', '🤔🙄🙅🏼'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)


Also for each tweet, assign a class label (0 or 1) using its `screen_name`. Use 0 for realDonaldTrump, mike_pence, GOP and 1 for the rest.

In [19]:
def create_labels(processed_tweets):
    """ derives the binary class label of every tweet from its screen_name
    Inputs:
        processed_tweets: pd.DataFrame: tweets read from train file, containing the column 'screen_name'
    Outputs:
        numpy.ndarray(int): dense binary numpy array of class labels
                            (0 for realDonaldTrump, mike_pence and GOP; 1 for every other account)
    """
    class_zero_accounts = ["realDonaldTrump", "mike_pence", "GOP"]
    is_class_zero = processed_tweets.screen_name.isin(class_zero_accounts)

    # Cast explicitly so the dtype matches np.array(..., dtype=int).
    return np.where(is_class_zero, 0, 1).astype(int)
    
# Build the label vector and count how many training tweets fall in class 1.
y = create_labels(processed_tweets)
print(y)
print(len([k for k in y if k == 1]))


[0 1 1 ..., 0 1 0]
8652


## 3. Classification 

In [20]:
def learn_classifier(X_train, y_train, kernel='best'):
    """ learns a classifier from the input features and labels using the kernel function supplied
    Inputs:
        X_train: scipy.sparse.csr.csr_matrix: sparse matrix of features, output of create_features()
        y_train: numpy.ndarray(int): dense binary vector of class labels, output of create_labels()
        kernel: str: kernel function to be used with classifier. [best|linear|poly|rbf|sigmoid]
                    if 'best' is supplied, the kernel is reset to the one determined to be the best
    Outputs:
        sklearn.svm.SVC: classifier learnt from data
    """
    if kernel == 'best':
        # Chosen from the kernel comparison in this notebook, where 'linear'
        # scored far higher than 'rbf', 'poly' and 'sigmoid'.
        kernel = 'linear'

    # SVC is imported by name at the top of the notebook; reaching it through the bare
    # `sklearn` package only works if the svm submodule happens to be loaded already.
    svm_classifier = SVC(kernel=kernel)
    svm_classifier.fit(X_train, y_train)

    return svm_classifier

# Train an SVM with an explicitly requested linear kernel and show its configuration.
classifier = learn_classifier(X, y, 'linear')
print(classifier)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [21]:
def evaluate_classifier(classifier, X_validation, y_validation):
    """ measures the accuracy of a classifier on the supplied validation data
    Inputs:
        classifier: sklearn.svm.classes.SVC: classifier to evaluate
        X_validation: scipy.sparse.csr.csr_matrix: sparse matrix of features
        y_validation: numpy.ndarray(int): dense binary vector of class labels
    Outputs:
        double: accuracy of classifier on the validation data
    """
    predicted = classifier.predict(X_validation)
    # Accuracy = number of predictions that match the true label / number of labels.
    n_correct = np.sum(predicted == y_validation)
    n_total = len(y_validation)
    return n_correct / n_total
    
# NOTE: X and y are the same data the classifier was fitted on, so this figure is
# training accuracy rather than an estimate of performance on unseen tweets.
accuracy = evaluate_classifier(classifier, X, y)
print(accuracy)


0.956700196555


In [283]:
# AUTOLAB_IGNORE_START
# Compare the four kernels accepted by learn_classifier(). Every model is scored on
# the same (X, y) it was trained on, so the printed values are training accuracies.
# The loop rebinds the notebook-level `classifier` and `accuracy` variables; after it
# finishes they refer to the last kernel tried ('sigmoid').
for kernel in ['linear', 'rbf', 'poly', 'sigmoid']:
    classifier = learn_classifier(X, y, kernel)
    accuracy = evaluate_classifier(classifier, X, y)
    print(kernel,':',accuracy)
# AUTOLAB_IGNORE_STOP

linear : 0.956700196555
rbf : 0.500173430454
poly : 0.500173430454
sigmoid : 0.500173430454


In [41]:
def classify_tweets(tfidf, classifier, unlabeled_tweets):
    """ predicts class labels for raw tweet text
    Inputs:
        tfidf: sklearn.feature_extraction.text.TfidfVectorizer: the TfidfVectorizer object used on training data
        classifier: sklearn.svm.classes.SVC: classifier learnt
        unlabeled_tweets: pd.DataFrame: tweets read from tweets_test.csv, containing the column 'text'
    Outputs:
        numpy.ndarray(int): dense binary vector of class labels for unlabeled tweets
                            (an empty array when there are no tweets to classify)
    """
    # Nothing to classify: return an empty label vector rather than handing a
    # zero-row matrix to the classifier, which scikit-learn estimators typically reject.
    if len(unlabeled_tweets) == 0:
        return np.array([], dtype=int)

    # Apply exactly the same text processing that was used for the training tweets.
    unlabelled_processed = process_all(unlabeled_tweets)

    # Iterate over the column values themselves. Looking rows up with .text[i] for
    # i in range(len(...)) is a *label* lookup and fails (or picks the wrong rows)
    # whenever the dataframe index is not exactly 0..n-1.
    documents = [" ".join(tokens) for tokens in unlabelled_processed.text]

    # transform(), not fit_transform(): the vocabulary and IDF weights must stay
    # the ones learnt from the training data.
    features = tfidf.transform(documents)
    return classifier.predict(features)
    

# Retrain with the kernel selected as 'best' (the kernel-comparison cell leaves
# `classifier` bound to a different model), then label the test tweets using the
# vectorizer that was fitted on the training data.
classifier = learn_classifier(X, y, 'best')
unlabeled_tweets = pd.read_csv("tweets_test.csv", na_filter=False)
y_pred = classify_tweets(tfidf, classifier, unlabeled_tweets)
