In [38]:
import numpy as np

mvfile = 'movies.csv'
rating_file = 'movieratings.csv'

class Embeddings:

    def __init__(emb, mvfile, rating_file):
        emb.movies = emb.load_movies(mvfile)
        emb.movie_ratings = emb.load_ratings(rating_file)
        emb.cooccurences = emb.make_cooccurence()
        #emb.save_cooccurences("cooccurrence.csv.gz")
        emb.embeddings = emb.train()
        emb.embeddings = emb.train()

    def load_movies(emb, mvfile, delimiter="|"):
        movies = {}
        movie_dict = {}
        with open(mvfile) as f:
            for line in f:
                x = line.split(delimiter)
                movies[int(x[0])] = x[1]
        return movies

    def load_ratings(emb, rating_file, delimiter=","):
        users_dict = {}
        with open(rating_file) as f:
            for line in f:
                x = line.split(delimiter)
                movie_id, user_id, rating = (int(x[0]), int(x[1]), int(x[2]))
                if not rating:
                    continue
                if user_id in users_dict:
                    users_dict[user_id].add(movie_id)
                else:
                    users_dict[user_id] = set([movie_id])
        return users_dict

    def make_cooccurence(emb):
        movies = len(emb.movies.keys())
        cooccurences = np.zeros((movies, movies))
        # the u's are dictionaries every user's movie ratings
        for u in emb.movie_ratings.values():
            for i in u:
                for j in u:
                    cooccurences[i-1,j-1] += 1
        return cooccurences

    def save_cooccurences(emb, file):
        np.savetxt(file, emb.cooccurences)
        return
   
    def train(emb):
        n = emb.cooccurences.shape[0]
        k = 300
        l = 0.00001
        iterations = 200
        v =  0.657304632 * np.random.randn(n, k) # ----- 2 (b) 0.657304632 
        #v = np.zeros((n, k))  # -------- 2 (a)
        for i in range(iterations):
            if i < 11 or i == iterations-1:
                print "Iter: %d, cost: %.2f" % (i, emb.cost(v))
            v = emb.gradient(v, l)
        return v
    

    def gradient(emb, v, l):
        grad = np.dot(v, v.T) - emb.cooccurences
        np.fill_diagonal(grad, 0)
        return v-4*l*np.dot(grad, v)

    def cost(emb, v):
        c = np.square(emb.cooccurences - np.dot(v,v.T))
        np.fill_diagonal(c, 0)
        return np.sum(c)

    def cosine_similarity(emb, v1, v2):
        a = np.linalg.norm(v1)
        b = np.linalg.norm(v2)
        return np.divide(np.dot(v1,v2), a*b)

    def recommend1(emb, movie, r):
        if movie not in emb.movies:
            return []

        scores = []
        v = emb.embeddings[movie-1,:]

        for i, name in emb.movies.items():
            s = emb.cosine_similarity(v, emb.embeddings[i-1,:])
            scores.append((name, s))

        return sorted(scores, key=lambda x: x[1], reverse=True)[:r]

    def recommend2(emb, movies, r):
        n = 0
        v = np.zeros(emb.embeddings.shape[1])
        for m in movies:
            if m not in emb.movies:
                continue

            v += emb.embeddings[m-1,:]
            n += 1

        v = np.divide(v, n)
        scores = []

        for i, name in emb.movies.items():
            s = emb.cosine_similarity(v, emb.embeddings[i-1,:])
            scores.append((name, s))

        return sorted(scores, key=lambda x: x[1], reverse=True)[:r]

if __name__ == "__main__":
    me = Embeddings("movies.csv", "movieratings.csv")

    recommend1 = me.recommend1(95, 10)
    recommend2 = me.recommend2([1, 94], 10)


Iter: 0, cost: 326978321.06
Iter: 1, cost: 302058045.33
Iter: 2, cost: 277306988.12
Iter: 3, cost: 247761898.67
Iter: 4, cost: 207824461.04
Iter: 5, cost: 156870607.87
Iter: 6, cost: 110003234.79
Iter: 7, cost: 86188253.52
Iter: 8, cost: 77226805.34
Iter: 9, cost: 71412900.89
Iter: 10, cost: 66592336.14
Iter: 199, cost: 2241282.01
Iter: 0, cost: 326413225.26
Iter: 1, cost: 301870142.76
Iter: 2, cost: 277593537.22
Iter: 3, cost: 248750621.46
Iter: 4, cost: 209728707.05
Iter: 5, cost: 159319275.27
Iter: 6, cost: 111625287.24
Iter: 7, cost: 86514161.70
Iter: 8, cost: 77221142.73
Iter: 9, cost: 71378396.86
Iter: 10, cost: 66559917.22
Iter: 199, cost: 2234355.61


In [39]:
recommend1

[('Aladdin (1992)\r\n', 1.0),
 ('Beauty and the Beast (1991)\r\n', 0.81897239160501878),
 ('Lion King, The (1994)\r\n', 0.79930836950035722),
 ('Back to the Future (1985)\r\n', 0.76161930093299768),
 ('Jurassic Park (1993)\r\n', 0.76113137158235633),
 ('Apollo 13 (1995)\r\n', 0.74140970993223632),
 ('Toy Story (1995)\r\n', 0.72923166996065103),
 ('Groundhog Day (1993)\r\n', 0.72823190991076003),
 ('Empire Strikes Back, The (1980)\r\n', 0.72362418667676431),
 ('Forrest Gump (1994)\r\n', 0.71336650049064743)]

In [41]:
recommend2

[('Toy Story (1995)\r\n', 0.96016816450032139),
 ('Star Wars (1977)\r\n', 0.83726859206004123),
 ('Return of the Jedi (1983)\r\n', 0.83462954331749939),
 ('Apollo 13 (1995)\r\n', 0.79237546908290224),
 ('Independence Day (ID4) (1996)\r\n', 0.78338197246002272),
 ('Back to the Future (1985)\r\n', 0.77820603187067328),
 ('Men in Black (1997)\r\n', 0.77521234241974779),
 ('Star Trek: First Contact (1996)\r\n', 0.77361270009993488),
 ('Raiders of the Lost Ark (1981)\r\n', 0.7733392717131421),
 ('Empire Strikes Back, The (1980)\r\n', 0.77286425123663283)]

In [None]:
### Question - 2 IDS 576 Assignment 2
# Implementing a Twitter Bot
#%pwd
#%cd

import tweepy
from time import sleep

#reading credentials for app directly from stored credential file
%run '/Users/adityabhandari/Desktop/Spring 2018/IDS 576 - ADV PRED/Assignment/Assignment-2/twitterbot_credentials.py'

# Create variables for each key, secret, token
#consumer_key = 'abcdef'
#consumer_secret = 'abcdef'
#access_token = 'abcdef'
#access_token_secret = 'abcdef'

# Creating an OAuthHandler instance into which we’ll pass our consumer token and secretand integrate with API. 
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Write a tweet to push to our Twitter account
tweet = 'Hello, world!'
api.update_status(status=tweet)


In [None]:
# Open text file text_corpus.txt 
my_file = open('text_corpus/text_corpus.txt', 'r')

# Read lines one by one from my_file and assign to file_lines variable
file_lines = my_file.readlines()

# Close file
my_file.close()

# Create a for loop to iterate over file_lines
for line in file_lines:
    try:
        print(line)

    # Add if statement to ensure that blank lines are skipped
        if line != '\n':
            api.update_status(line)

    # Add an else statement with pass to conclude the conditional statement
        else:
            pass
    except:tweepy.TweepError as e:
            print(e.reason)
        
# Add sleep method to space tweets by 5 seconds each
    sleep(360)

In [None]:
from __future__ import print_function
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

max_features = 20000
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

In [None]:
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

In [None]:
print('Building model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print('Training model...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=3,
          validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


In [None]:
# Small LSTM Network to Generate Text alternative approach
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# load ascii text and covert to lowercase
filename = "/Users/adityabhandari/Desktop/Spring 2018/IDS 576 - ADV PRED/Assignment/Assignment-2/text_corpus/text_corpus.txt"
raw_text = open(filename).read()
raw_text = raw_text.lower()

# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))

# normalize
X = X / float(n_vocab)

# one hot encode the output variable
y = np_utils.to_categorical(dataY)

# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# fit the model
model.fit(X, y, epochs=2, batch_size=128, callbacks=callbacks_list)

In [None]:
# load the network weights
filename = "weights-improvement-19-1.9435.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# pick a random seed
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# generate characters
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print ("\nDone.")

In [None]:
# Bonus question N-gram repeated steps for question2
import math
import re
import csv
from itertools import zip_longest
from datetime import datetime


def tokenize(input_file, encoding):
    lst =[]
    with open(input_file, 'r', encoding=encoding) as f:
        for sent in f:
            sent = sent.lower()
            sent = re.sub("[A-z0-9\'\"`\|\/\+\#\,\)\(\?\!\-\:\=\;\.\«\»\—\@]", '', sent)
            sent = re.findall('\w+', sent)
            for word in sent:
                lst.append(word)
    return lst


def ngrams_split(lst, n):
    counts = dict()
    grams = [' '.join(lst[i:i+n]) for i in range(len(lst)-n)]
    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1
    return counts


def list_add(counts):
    ngrams = []
    for key, val in counts.items():
        ngrams.append((val, key))
    return ngrams


def gram_add(lst, n):
    ng = []
    grams = [' '.join(lst[i:i+n]) for i in range(len(lst)-n)]
    for gram in grams:
        ng.append(gram)
    return ng


def two_gram_count(input_file, encoding, n_filter, n):
    output_file = []
    lst = tokenize(input_file, encoding) #tokenize
    n_words = len(lst)
    counts = ngrams_split(lst, n) #spliting into ngrams
    ngrams = list_add(counts)  #ading ngrmas to list
    for key, val in ngrams:
        if int(key) >= n_filter:
            ngram_freq = math.log(key/n_words)
            num = key*n_words
            f1 = lst.count(val.split()[0])
            f2 = lst.count(val.split()[1])
            mi = math.pow(math.log(num/(f1*f2), 10), 2)
            ngram_prob = math.log(key/f1, 10)
            output_file.append((ngram_freq, mi, ngram_prob, key, val))
    return output_file


def three_gram_count(input_file, encoding, n_filter, n):
    output_file = []
    lst = tokenize(input_file, encoding) #tokenize
    n_words = len(lst)
    counts = ngrams_split(lst, n) #spliting into ngrams
    ngrams = list_add(counts)  #ading ngrmas to list
    ng = gram_add(lst, 2)
    for key, val in ngrams:
        if int(key) >= n_filter:
            ngram_freq = math.log(key/n_words, 10)
            num = key*n_words
            c2gram = ng.count(val.split()[0] + " " + val.split()[1])
            f1 = lst.count(val.split()[0])
            f2 = lst.count(val.split()[1])
            f3 = lst.count(val.split()[2])
            mi = math.pow(math.log(num/(f1*f2*f3), 10), 2)
            ngram_prob = math.log(key/c2gram, 10)
            output_file.append((ngram_freq, mi, ngram_prob, key, val))
    return output_file


def four_grams_count(input_file, encoding, n_filter, n):
    output_file = []
    lst = tokenize(input_file, encoding) #tokenize
    n_words = len(lst)
    counts = ngrams_split(lst, n) #spliting into ngrams
    ngrams = list_add(counts)  #ading ngrmas to list
    ng2 = gram_add(lst, 2)
    for key, val in ngrams:
        if int(key) >= n_filter:
            ngram_freq = math.log(key/n_words, 10)
            num = key*n_words
            c1gram = ng2.count(val.split()[0] + " " + val.split()[1])
            c2gram = ng2.count(val.split()[1] + " " + val.split()[2])
            c3gram = ng2.count(val.split()[2] + " " + val.split()[3])
            f1 = lst.count(val.split()[0])
            f2 = lst.count(val.split()[1])
            f3 = lst.count(val.split()[2])
            f4 = lst.count(val.split()[3])
            mi = math.pow(math.log(num/(f1*f2*f3*f4), 10), 2)
            prob1 = c1gram/f1
            prob2 = c2gram/f2
            prob3 = c3gram/f3
            ngram_prob = math.log(prob1, 10) + math.log(prob2, 10) +    math.log(prob3, 10)
            output_file.append((ngram_freq, mi, ngram_prob, key, val))
    return output_file


def n_grams_stat(input_file, encoding, n_filter, n):
    output_file = []
    if n == 2:
        for i in two_gram_count(input_file, encoding, n_filter, n):
            output_file.append(i)
    elif n == 3:
        for i in three_gram_count(input_file, encoding, n_filter, n):
            output_file.append(i)
    elif n == 4:
        for i in four_grams_count(input_file, encoding, n_filter, n):
            output_file.append(i)
    return output_file

start_time = datetime.now()
for a, b, c in n_grams_stat("C:/Users/bhandari/Downloads/Adv. Predictive/Assignment 2/text_corpus.txt",'utf-8', n_filter=3, n=4):
    print(a, b, c)
    with open("C:/Users/bhandari/Downloads/Adv. Predictive/Assignment 2/men_4grams.csv", 'dwwaa') as f:
        f.write("Aditya Bhandari")
        f.write(str(a)  +", "+ str(b) + ", "+ str(c) + '\n ')
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
