# Machine Learning Workflow
Work through the steps below to build the machine learning workflow for this classifier.

In [94]:
import nltk
# Fetch the 'punkt' and 'wordnet' NLTK data packages; packages that are already
# present are reported as up-to-date rather than downloaded again.
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [95]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [96]:
# Read the raw CSV (latin-1 encoded) for a quick look at its contents.
# load_data() below reads the same file again and filters it.
test = pd.read_csv('corporate_messaging.csv', encoding='latin-1')

In [97]:
# Show the last two rows to inspect the available columns.
test.tail(2)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,category,category:confidence,category_gold,id,screenname,text
3116,677278316,True,golden,8,,Information,0.8973,Information,3.95962e+17,DanoneGroup,"Yo-Jelly, Danone new brand in South Africa : t..."
3117,677278317,True,golden,5,,Information,1.0,Information,3.93732e+17,Nestle,Z Bhutta: Problems with food&amp;land systems ...


In [98]:
# Pattern for http/https URLs; tokenize() uses it to swap links for a placeholder.
# Written as a raw string so the regex escapes `\(` and `\)` are not parsed as
# (invalid) Python string escape sequences. The pattern text itself is unchanged.
# NOTE(review): `$-_` inside the third character class is a range (0x24-0x5F), so it
# matches more punctuation than the characters spelled out -- confirm this is intended.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data(filepath='corporate_messaging.csv'):
    """Load the corporate messaging CSV and return message texts and labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        filepath: Path of the CSV file to read (latin-1 encoded). Defaults to
            'corporate_messaging.csv' in the working directory, which is the
            file this function always read before the parameter existed.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the array of values from the ``text``
        column and ``y`` is the array of matching ``category`` values.
    """
    df = pd.read_csv(filepath, encoding='latin-1')
    # Keep fully-confident labels only, and drop the 'Exclude' category.
    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    kept = df[keep]
    return kept['text'].values, kept['category'].values

def tokenize(text):
    """Turn a raw message into a list of normalized tokens.

    Every URL matched by ``url_regex`` is replaced with the literal token
    ``urlplaceholder``; the text is then split with ``word_tokenize`` and each
    token is lemmatized, lower-cased and stripped of surrounding whitespace.

    Args:
        text: The message text to tokenize.

    Returns:
        list: The cleaned tokens, in their original order.
    """
    # Substitute each match where it occurs. Finding all URLs first and then
    # calling str.replace() per URL breaks when one URL is a prefix of a later
    # one: the shorter URL gets replaced inside the longer one, leaving
    # "urlplaceholder" followed by the leftover suffix.
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()

    # NOTE(review): tokens are lemmatized before lower-casing, as in the original;
    # verify whether capitalized words should be lower-cased first.
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]

### Step 1: Load data and perform a train test split

In [99]:
from sklearn.model_selection import train_test_split

# Load the filtered message texts (X) and their category labels (y).
X, y = load_data()

# Hold out 33% of the messages for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [100]:
# Peek at the message texts returned by load_data() (long arrays are elided in the middle).
X

array([ 'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG',
       'Barclays announces result of Rights Issue http://t.co/LbIqqh3wwG',
       'Barclays publishes its prospectus for its å£5.8bn Rights Issue: http://t.co/YZk24iE8G6',
       ...,
       'WeÌ¢\x89âÂ\x89ã¢re working hard to do all we can to promote healthier lifestyles and diets for kids http://t.co/hw8oIHYMaI',
       'Yesterday, these #HealthyKids lit up Broadway with #Nestle, @iaaforg and some sporting stars: http://t.co/YdtBj60Ofz',
       'Z Bhutta: Problems with food&amp;land systems include land acquistion, commodity speculation affecting food prices&amp;lack of discussion #NINS2013'], dtype=object)

In [101]:
import numpy as np
# List the distinct category labels left after filtering in load_data().
np.unique(y)

array(['Action', 'Dialogue', 'Information'], dtype=object)

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [102]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
# From the CountVectorizer documentation --
# tokenizer : callable or None (default)
# Override the string tokenization step while preserving the preprocessing and n-grams generation steps.

# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
# Classifiers tried, with the test accuracy noted for each:
#clf = LogisticRegression()   # ~0.89 on test
clf = RandomForestClassifier()   # ~0.92 on test
#clf = AdaBoostClassifier()  # ~0.73 on test
# NOTE(review): the forest is created without a random_state, so scores can differ between runs.

# Fit each step on the training data only.

X_train_vec = vect.fit_transform(X_train)
# X_train_vec.toarray() would show the word counts as a dense array
X_train_tfidf = tfidf.fit_transform(X_train_vec)
# X_train_tfidf.toarray() would show the tf-idf values as a dense array

clf.fit(X_train_tfidf,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [103]:
# Predict on the same tf-idf features the classifier was fitted on
# (a check of the fit itself, not of generalization).
y_train_predict = clf.predict(X_train_tfidf)

In [104]:
# Predicted labels for the training messages.
y_train_predict

array(['Action', 'Information', 'Information', ..., 'Information',
       'Action', 'Information'], dtype=object)

In [105]:
# True labels for the training messages, for comparison with the predictions above.
y_train

array(['Action', 'Information', 'Information', ..., 'Information',
       'Action', 'Information'], dtype=object)

In [106]:
from sklearn.metrics import accuracy_score,confusion_matrix
# Accuracy and confusion matrix on the training split (true labels vs. predictions).
acc = accuracy_score(y_train, y_train_predict)
cm = confusion_matrix(y_train, y_train_predict)

print(acc)
print('\n')  # blank separator between the accuracy and the matrix
print(cm)

0.996273291925


[[ 316    0    3]
 [   0   72    0]
 [   3    0 1216]]


### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [107]:
# Transform test data with the already-fitted vectorizer and tf-idf transformer
# (transform only -- nothing is fitted on the test texts).

X_test_vec = vect.transform(X_test)
# X_test_vec.toarray() would show the word counts as a dense array
X_test_tfidf = tfidf.transform(X_test_vec)
# X_test_tfidf.toarray() would show the tf-idf values as a dense array


# Predict test labels
y_test_predict = clf.predict(X_test_tfidf)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [108]:
# Use the distinct true test labels to fix the row/column order of the matrix.
labels = np.unique(y_test)
confusion_mat = confusion_matrix(y_test, y_test_predict,labels=labels)
accuracy = accuracy_score(y_test, y_test_predict)

# Report the label order alongside the matrix so its rows and columns can be read.
print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[103   1  33]
 [  3  39  10]
 [ 15   1 588]]
Accuracy: 0.920554854981


In [112]:
# Inspect the test word counts as a dense array (one row per test message).
X_test_vec.toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 2, 0, ..., 0, 0, 0],
       ..., 
       [0, 1, 0, ..., 0, 0, 0],
       [0, 2, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Final Step: Refactor
Organize these steps into the following functions.

In [115]:
def display_results(y_true,y_predict,text):
    """Print a labelled confusion-matrix and accuracy report for one data split.

    Args:
        y_true: Ground-truth class labels.
        y_predict: Predicted class labels, aligned with ``y_true``.
        text: Heading printed before the report (for example 'train' or 'test').
    """
    # The distinct true labels set the row/column order of the matrix.
    class_names = np.unique(y_true)
    matrix = confusion_matrix(y_true, y_predict, labels=class_names)
    score = accuracy_score(y_true, y_predict)

    # One tuple per output line; the trailing empty string prints a blank separator.
    report_lines = [
        (text,),
        ("Labels:", class_names),
        ("Confusion Matrix:\n", matrix),
        ("Accuracy:", score),
        ('',),
    ]
    for parts in report_lines:
        print(*parts)


def main(random_state=42):
    """Run the whole workflow: load, split, vectorize, train and report.

    Loads the messages with ``load_data``, holds out 33% of them as a test
    set, fits a ``CountVectorizer`` (using ``tokenize``) and a
    ``TfidfTransformer`` on the training texts, trains a
    ``RandomForestClassifier`` on the resulting tf-idf values, and prints
    results for the training and test splits via ``display_results``.

    Args:
        random_state: Seed passed to the random forest so that repeated runs
            print the same results. Pass ``None`` for an unseeded forest, which
            is how the classifier was previously created.
    """
    # Load the data and split it; the split seed is fixed so both sets stay the same.
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Initialize the transformers and the classifier.
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    # Seeded: an unseeded forest produced different confusion matrices on every run.
    clf = RandomForestClassifier(random_state=random_state)

    # Fit the transformers and the classifier on the training data only.
    X_train_vec = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_vec)
    clf.fit(X_train_tfidf, y_train)

    # The training features are already transformed, so predict on them directly.
    y_train_predict = clf.predict(X_train_tfidf)
    display_results(y_train, y_train_predict, 'train')

    # Transform (no fitting) the test data with the fitted transformers.
    X_test_vec = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_vec)

    # Predict test labels
    y_test_predict = clf.predict(X_test_tfidf)
    display_results(y_test, y_test_predict, 'test')

In [116]:
# Run the refactored workflow end to end; it prints the train and test results.
main()

train
Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 316    0    3]
 [   0   71    1]
 [   1    0 1218]]
Accuracy: 0.996894409938

test
Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[104   0  33]
 [  3  41   8]
 [ 17   2 585]]
Accuracy: 0.920554854981

