In [5]:
import csv
import re
import sys
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.special import entr
from scipy.stats import norm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords

In [6]:
# Needed functions for processing the data.

# Function to clean text data.
def clean_review(text):
    """Return ``text`` with HTML tags, double quotes and ``@`` signs removed.

    Every HTML tag is replaced by a single space. Escaped quotes (a backslash
    followed by a double quote), bare double quotes and ``@`` characters are
    deleted outright.
    """
    # Tags turn into a space so the words on either side stay separated.
    stripped = re.sub('<[^<]+?>', ' ', text)

    # Order matters: backslash-quote pairs go first, then the bare quotes.
    for unwanted in ('\\"', '"', '@'):
        stripped = stripped.replace(unwanted, '')

    return stripped

# Converts text into a list of integer vocabulary indices (words missing from the index are skipped).
def to_sequence(tokenizer, preprocessor, index, text):
    """Translate ``text`` into the list of indices of its known tokens.

    ``text`` is passed through ``preprocessor`` and then ``tokenizer``; every
    resulting token that is a key of ``index`` contributes ``index[token]`` to
    the result, in order. Tokens absent from ``index`` are dropped.
    """
    sequence = []
    for token in tokenizer(preprocessor(text)):
        if token in index:
            sequence.append(index[token])
    return sequence

# Shuffle the data function.
def shuffle(X, y):
    """Shuffle ``X`` and ``y`` with one shared random permutation.

    Both inputs are reordered by the same permutation, so element ``i`` of the
    returned ``X`` still belongs with element ``i`` of the returned ``y``.

    Plain lists and tuples cannot be indexed by a permutation array, so they
    are converted to NumPy arrays first; other inputs are indexed as they are.

    Args:
        X: Samples, as a list, tuple or array-like indexable by an integer array.
        y: Labels aligned with ``X``, in the same accepted forms.

    Returns:
        A tuple ``(X, y)`` holding the shuffled samples and labels.
    """
    if isinstance(X, (list, tuple)):
        X = np.asarray(X)
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)

    # Drawn from NumPy's global random state, so np.random.seed() controls it.
    perm = np.random.permutation(len(X))
    return X[perm], y[perm]

In [141]:
# Data processing for news dataset.
# News cell 1/4

# Reading this file with the default codec fails with:
#     'utf-8' codec can't decode byte 0x96 in position 37: invalid start byte
# so it is opened with the latin-1 encoding instead.
# The allImdb and tweets datasets seem not to need any modification.

path = '/Users/AJApple/Downloads/'

# Flatten every CSV field into one list: text, label, text, label, ...
with open(path + 'world_news_in_month_cleaned_2_columsw.csv', 'rt', encoding='latin-1', newline='') as src:
    reader_news = csv.reader(src, dialect='excel', lineterminator='\n')
    data_news = [field for row in reader_news for field in row]

# Even positions hold the text and odd positions hold its label. Nothing is
# skipped here, so a header row (if the file has one) lands at index 0.
data_sentence_news = data_news[0::2]
# Each label stays wrapped in its own one-element list, e.g. ['1'].
data_sentiment_news = [[label] for label in data_news[1::2]]

news_sentiment = data_sentiment_news
news_text = [clean_review(sentence) for sentence in data_sentence_news]

# This cell produces two parallel lists, one for the input text called:
#     news_text
# and one for the sentiment i.e. label/output called:
#     news_sentiment

In [149]:
# Spot-check one row: print its raw text, raw label, cleaned text and label.

a = 52428  # arbitrary row; change it to inspect other records
for column in (data_sentence_news, data_sentiment_news, news_text, news_sentiment):
    print(column[a])

US health regulators have approved the first new type of flu drug in two decades. Wednesday's approval of Xofluza for people ages 12 and older comes ahead ...
['1']
US health regulators have approved the first new type of flu drug in two decades. Wednesday's approval of Xofluza for people ages 12 and older comes ahead ...
['1']


In [None]:
# News Cell 2/4

# Corpus statistics for the news data.
# NOTE(review): the figures below are computed from data_sentence_news (the
# text as read from the CSV), not from news_text -- confirm that is intended.

# Split every sample on whitespace once and reuse the token lists below.
tokenized_news = [text.split() for text in data_sentence_news]

# Words per sample, every word in corpus order, and the overall word total.
sentence_lengths_array_news = [len(tokens) for tokens in tokenized_news]
global_sentence_news = [word for tokens in tokenized_news for word in tokens]
word_count_news = sum(sentence_lengths_array_news)

# Some dataset statistics.

total_sentences_news = len(data_sentence_news)
print('Total number of sentences = ', total_sentences_news)

# A set keeps one copy of each distinct word, so its size is the vocabulary size.
unique_word_count_news = len(set(global_sentence_news))
print('Unique word count news = ', unique_word_count_news)

# Total number of words across all samples.
print('Total word count = ', word_count_news)

# Ratios describing how varied the vocabulary is.
print('Unique to total word count of news metric = ', unique_word_count_news / word_count_news)
print('Unique word to number of sentences news metric = ', unique_word_count_news / total_sentences_news)

# Fit a normal distribution to the sample lengths; norm.fit returns the
# estimated location (mean) and scale (standard deviation).
mu_news, std_news = norm.fit(sentence_lengths_array_news)

In [None]:
# News Cell 3/4

# Plot the histogram of sample lengths, normalised so the bars integrate to 1.
# `density=True` replaces the old `normed=True` keyword, which current
# Matplotlib releases no longer accept.
plt.hist(sentence_lengths_array_news, 50, density=True, color='r')

# Plot the PDF.
# Note there are two distributions that can be fit to the data. The first is a
# Gaussian (its overlay is disabled below) and the second is non-parametric,
# formed by overlapping kernels (drawn by seaborn at the end of the cell).

# xmin, xmax = plt.xlim()
# x = np.linspace(xmin, xmax, 100)
# y = norm.pdf(x, mu_news, std_news)
# plt.plot(x, y)
plt.xlabel('Length of news samples')
# The histogram is normalised, so the y axis is a density rather than a count.
plt.ylabel('Probability density')
plt.title('Sample Length distribution with a fitted distribution')
plt.show()
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot or
# displot are the documented replacements once the seaborn version is confirmed.
sns.distplot([sentence_lengths_array_news])  # seaborn plot

In [None]:
# News Cell 4/4

# Split the news data into train/test sets and turn every text into a padded
# sequence of vocabulary indices.
# tokenize, preprocess, word2idx, pad_sequences and N_FEATURES are not defined
# in the cells shown here and must already exist in the kernel.
# NOTE(review): the split uses data_sentence_news, not the cleaned news_text --
# confirm which one the model should see.
X_train_news, X_test_news, y_train_news, y_test_news = train_test_split(data_sentence_news, data_sentiment_news, test_size=0.2)

X_train_sequences_news = [to_sequence(tokenize, preprocess, word2idx, x) for x in X_train_news]

# Longest training sequence; used as the common padded length below.
MAX_SEQ_LENGHT_news = max(len(sequence) for sequence in X_train_sequences_news)
print("MAX_SEQ_LENGHT_news=", MAX_SEQ_LENGHT_news)

# Pad with N_FEATURES up to the news maximum computed above.
# NOTE(review): this cell previously padded to MAX_SEQ_LENGHT_allImdb, which
# looks like a copy-paste slip; restore it only if the sequences must match a
# model built with the IMDB length.
X_train_sequences_news = pad_sequences(X_train_sequences_news, maxlen=MAX_SEQ_LENGHT_news, value=N_FEATURES)
X_test_sequences_news = [to_sequence(tokenize, preprocess, word2idx, x) for x in X_test_news]
X_test_sequences_news = pad_sequences(X_test_sequences_news, maxlen=MAX_SEQ_LENGHT_news, value=N_FEATURES)

# Need to turn string values into integers for the model. Each label arrives
# wrapped in a one-element list such as ['1'], so unwrap it before converting.
y_train_news = [int(label[0]) for label in y_train_news]
y_test_news = [int(label[0]) for label in y_test_news]

In [111]:
# Data processing for allImdb dataset.
# allImdb Cell 1/4

path = '/Users/AJApple/Downloads/'

# Read inside a `with` block so the file handle is closed as soon as the rows
# are consumed. newline='' is what the csv module asks for on files it reads.
# Every field is flattened into one list: text, label, text, label, ...
with open(path + 'IMDB_dataset_2.csv', newline='') as src:
    reader_allImdb = csv.reader(src, delimiter=',')
    data_allImdb = [field for row in reader_allImdb for field in row]

# These lists contain all the data. The data will still need to get shuffled
# and split into training and test sets.
# Even positions hold the review text, odd positions hold its label; each
# label stays wrapped in a one-element list, e.g. ['positive'].
data_sentence_allImdb = data_allImdb[0::2]
data_sentiment_allImdb = [[label] for label in data_allImdb[1::2]]

# Encode the labels: 'positive' -> 1, anything else -> 0.
imdb_sentiment = [1 if label == ['positive'] else 0 for label in data_sentiment_allImdb]

# Clean the sentence data i.e. removing html tags, ... etc.
imdb_text = [clean_review(review) for review in data_sentence_allImdb]

# This cell produces two parallel lists, one for the input text called:
#     imdb_text
# and one for the sentiment i.e. label/output called:
#     imdb_sentiment

In [125]:
# Sanity check: print one row's raw review, raw label, cleaned review and
# encoded label so they can be compared against the original csv file.
# (It checks out.)

a = 49989  # arbitrary row; change it to inspect other records
for column in (data_sentence_allImdb, data_sentiment_allImdb, imdb_text, imdb_sentiment):
    print(column[a])

I got this one a few weeks ago and love it! It's modern, light but filled with true complexities of life. It questions and answers, just like other Eytan Fox movies. This is my favorite, along with Jossi & Jagger. This pictures a lot more, universally, than only the bubbles we may live in. You don't need to be Jewish or homosexual to enjoy this - I'm not, but the movie goes directly to my top ten movies. At first it seems like pure entertainment but it does make you think further. Relationships we have to live with are superficial, meaningful, deep, fatal, you name it. You don't know what's coming, and you definitely don't know where this story is heading as you watch it the first time. It is worth seeing several times. Fox movies include great bonus material - here a great music video and "the making of" (including explanation of the title, interviewing Lior Ashknenazi who plays himself in the movie and Arabs with doubts about the Israeli life styles).
['positive']
I got this one a fe

In [217]:
# The data is read with csv.reader because pandas was not loading it properly.
# tweets Cell 1/4

path = '/Users/AJApple/Downloads/tweet-sentiment-extraction/'

# Read inside a `with` block so the file handle is closed as soon as the rows
# are consumed. newline='' is what the csv module asks for on files it reads.
# Every field is flattened into one list: text, label, text, label, ...
with open(path + 'train_2_columns.csv', newline='') as src:
    reader = csv.reader(src, delimiter=',')
    data_tweets = [field for row in reader for field in row]

# Even positions hold the tweet text, odd positions hold its label; each label
# stays wrapped in a one-element list, e.g. ['positive'].
data_sentence_tweets = data_tweets[0::2]
data_sentiment_tweets = [[label] for label in data_tweets[1::2]]

# Encode the labels: 'positive' -> 1, 'negative' -> 0, anything else -> 2.
label_codes = {'positive': 1, 'negative': 0}
encoded_sentiment = [label_codes.get(label[0], 2) for label in data_sentiment_tweets]

cleaned_text = [clean_review(tweet) for tweet in data_sentence_tweets]

# Now need to remove all the neutral elements (code 2).
# Text and label are filtered together as pairs. Removing by value with
# list.remove() would delete the first *equal* tweet rather than the one at
# the current position, which misaligns text and labels when tweets repeat.
kept_pairs = [(text, label) for text, label in zip(cleaned_text, encoded_sentiment) if label != 2]
tweets_text = [text for text, _ in kept_pairs]
tweets_sentiment = [label for _, label in kept_pairs]

# This cell produces two parallel lists, one for the input text called:
#     tweets_text
# and one for the sentiment i.e. label/output called:
#     tweets_sentiment

In [218]:
# Sanity check: print one tweet and its encoded label so they can be compared
# against the original csv file. (It checks out.)

a = -1  # -1 selects the last tweet left after filtering
for column in (tweets_text, tweets_sentiment):
    print(column[a])

 But it was worth it  ****.
1


In [220]:
cd /Users/AJApple/Dropbox/machine-learning-fiu/ml_fiu

/Users/AJApple/Dropbox/machine-learning-fiu/ml_fiu


In [221]:
pwd


'/Users/AJApple/Dropbox/machine-learning-fiu/ml_fiu'