Natural Language Processing for Data Mining Project

In [None]:
# Libraries needed for Project

# Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing and Feature Engineering
from textblob import TextBlob
import string
import re
from nltk.stem import *
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

Text Processing

In [None]:
# Read the training and testing tweet CSV files into pandas DataFrames
# (paths are relative to the notebook's working directory).

train_tweets, test_tweets = (
    pd.read_csv(csv_path)
    for csv_path in ('training_tweets.csv', 'testing_tweets_tsla.csv')
)

In [None]:
# Drop every row that has a missing value in any column, in both datasets.
# The original row labels are kept (the index is not reset).

train_tweets, test_tweets = train_tweets.dropna(), test_tweets.dropna()

In [None]:
# Keep only the label and text columns for training, and the tweet text for testing.
# The explicit .copy() gives train_tweets its own data, so adding a new column to it
# later in the notebook does not trigger pandas' SettingWithCopyWarning.

train_tweets = train_tweets[['label', 'tweets']].copy()
test = test_tweets['tweets']

In [None]:
# Visualize how many training tweets fall under each label value.
# An explicit Axes lets us title/label the figure so it stands on its own;
# the trailing semicolon suppresses the repr that would otherwise be echoed.

fig, ax = plt.subplots()
sns.countplot(x='label', data=train_tweets, ax=ax)
ax.set(title='Label distribution in the training tweets',
       xlabel='label',
       ylabel='Number of tweets');

Data Preprocessing

In [None]:
# removal of punctuation from tweets

def remove_punctuation(tweet):
    """Return the tweet rebuilt from TextBlob's word tokens.

    The tweet is wrapped in a TextBlob and its ``words`` are joined back
    together with single spaces; TextBlob's word tokenization is what drops
    the punctuation here.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Space-separated word tokens of the tweet.
    """
    words = TextBlob(tweet).words
    return ' '.join(words)

In [None]:
# tokenize the tweets

def tokenize_tweets(tweet):
    """Filter a tweet down to its purely non-numeric word tokens.

    The tweet is split on whitespace. A token is kept only if it is not the
    literal placeholder ``'user'`` (case-sensitive) and consists entirely of
    word characters that are not digits (letters and underscores).

    Parameters
    ----------
    tweet : str
        Tweet text to filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    kept_tokens = []
    for token in tweet.split():
        # 'user' is dropped outright; other casings such as 'User' are kept.
        if token == 'user':
            continue
        # [^\W\d] == "word character that is not a digit"; the pattern must
        # consume the whole token, so anything with digits/punctuation fails.
        if re.match(r'[^\W\d]*$', token):
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# removing the stop words and returning a clean list of words

def remove_stop_words(tweet):
    """Tokenize a tweet and drop English stop words.

    The tweet is first passed through ``tokenize_tweets``; each remaining
    token is then kept only if its lowercase form is not in NLTK's English
    stop-word list. Tokens keep their original casing in the result.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    list of str
        Tokens that are not English stop words, in their original order.
    """
    # Build the lookup once per call. The original evaluated
    # stopwords.words('english') again for every single word and scanned the
    # resulting list linearly; a set gives the same membership answers.
    english_stop_words = set(stopwords.words('english'))
    cleaned = tokenize_tweets(tweet)
    return [word for word in cleaned.split()
            if word.lower() not in english_stop_words]

In [None]:
# Lemmatizer will break the word down into its root

def lemmatize_tweets(tweet):
    """Lemmatize every word of a tokenized tweet as a verb.

    Parameters
    ----------
    tweet : iterable of str
        The words of one tweet (e.g. the list returned by
        ``remove_stop_words``). Each element is lemmatized individually.

    Returns
    -------
    list of str
        One lemma per input word, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    # 'v' asks WordNet to treat each word as a verb when looking up its lemma.
    return [lemmatizer.lemmatize(token, 'v') for token in tweet]

In [None]:
#This is a function that will preprocess the tweet all at once

def tweet_scrubber(tweet):
    """Run the full preprocessing chain on one raw tweet.

    Order of operations: strip punctuation, filter tokens, remove stop
    words, then lemmatize. Note that ``remove_stop_words`` runs
    ``tokenize_tweets`` again internally on the already filtered text.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        The lemmatized tokens of the tweet.
    """
    without_punctuation = remove_punctuation(tweet)
    token_string = tokenize_tweets(without_punctuation)
    return lemmatize_tweets(remove_stop_words(token_string))

# Show one tweet before and after scrubbing.
# 218 is an index *label* (not a position), so this is the row that was
# numbered 218 in the CSV, provided it survived dropna().
print(train_tweets.loc[218, 'tweets'])
tweet_scrubber(train_tweets.loc[218, 'tweets'])

In [None]:
# Add a 'tweet_list' column holding the scrubbed token list of every tweet,
# for both the training and the testing frame.

for frame in (train_tweets, test_tweets):
    frame['tweet_list'] = frame['tweets'].apply(tweet_scrubber)

# Preview of the preprocessed frame: first rows with label 1, raw text hidden.

pd.set_option('display.max_colwidth', 120)
label_is_one = train_tweets['label'] == 1
train_tweets.loc[label_is_one].drop(columns='tweets').head()

ML Algorithm

In [None]:
# Inputs for sklearn's train_test_split: raw tweet text as features (X) and
# the label column as target (y); `test` is the tweet text of the testing file.

X, y = train_tweets['tweets'], train_tweets['label']
test = test_tweets['tweets']

In [None]:
# Create the 80/20 train/validation split.
# random_state pins the shuffle so the split, and therefore every metric
# reported below, is identical on each Restart & Run All.

msg_train, msg_test, label_train, label_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Machine learning pipeline. It receives the *raw* tweet strings: the
# CountVectorizer calls tweet_scrubber on each one to obtain its tokens,
# so preprocessing happens inside the pipeline at fit and predict time.

pipeline = Pipeline(steps=[
    # raw tweet -> scrubbed tokens -> token counts
    ('bow', CountVectorizer(analyzer=tweet_scrubber)),
    # token counts -> TF-IDF weighted scores
    ('tfidf', TfidfTransformer()),
    # Naive Bayes classifier trained on the TF-IDF vectors
    ('classifier', MultinomialNB()),
])

pipeline.fit(msg_train, label_train)

In [None]:
# Evaluate the fitted pipeline on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first swaps precision with recall in the report and transposes
# the confusion matrix (accuracy alone is unaffected by the order).

predictions = pipeline.predict(msg_test)

print(classification_report(label_test, predictions))
print ('\n')
print(confusion_matrix(label_test, predictions))  # rows: true label, columns: predicted label
print(accuracy_score(label_test, predictions))