In [13]:
import pandas as pd
import numpy as np
import os
import glob
import torch
import torch.nn as nn
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
import re
import string
from nltk.corpus import stopwords

In [14]:
for punc in string.punctuation:
    print(punc)

!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
:
;
<
=
>
?
@
[
\
]
^
_
`
{
|
}
~


In [15]:
vocab_path='imdb.vocab'

def make_vocab(path):
    # Reading imdb vocab file
    with open(path,encoding="utf8") as file:
        words_list=file.readlines()
    
    # Splitting /n and "" from the words
    words_list=[s.split('\n')[0].split('""')[0] for s in words_list]
    # Generating a list of lists containing the vocab word
    words_list=[[word] for word in words_list]
    return words_list

# A list to hold the imdb vocab
words_list=make_vocab(vocab_path)

In [16]:
words_list

[['the'],
 ['and'],
 ['a'],
 ['of'],
 ['to'],
 ['is'],
 ['it'],
 ['in'],
 ['i'],
 ['this'],
 ['that'],
 ['was'],
 ['as'],
 ['for'],
 ['with'],
 ['movie'],
 ['but'],
 ['film'],
 ['on'],
 ['not'],
 ['you'],
 ['he'],
 ['are'],
 ['his'],
 ['have'],
 ['be'],
 ['one'],
 ['!'],
 ['all'],
 ['at'],
 ['by'],
 ['an'],
 ['who'],
 ['they'],
 ['from'],
 ['so'],
 ['like'],
 ['there'],
 ['her'],
 ['or'],
 ['just'],
 ['about'],
 ['if'],
 ['has'],
 ['out'],
 ['what'],
 ['?'],
 ['some'],
 ['good'],
 ['more'],
 ['when'],
 ['she'],
 ['very'],
 ['even'],
 ['my'],
 ['no'],
 ['up'],
 ['time'],
 ['would'],
 ['which'],
 ['only'],
 ['story'],
 ['really'],
 ['their'],
 ['see'],
 ['had'],
 ['can'],
 ['were'],
 ['me'],
 ['we'],
 ['than'],
 ['well'],
 ['much'],
 ['been'],
 ['get'],
 ['people'],
 ['will'],
 ['bad'],
 ['other'],
 ['also'],
 ['into'],
 ['do'],
 ['because'],
 ['great'],
 ['first'],
 ['how'],
 ['him'],
 ['most'],
 ["don't"],
 ['its'],
 ['made'],
 ['then'],
 ['them'],
 ['way'],
 ['make'],
 ['could'],
 ['t

In [17]:
def read_files(path):
    """
    Run this script from the root directory
    """
    for file in glob.glob(f"{path}\*.txt"):
        yield file
        
def clean_query(query):
#     return query.translate(query.maketrans('','',string.punctuation)).replace('br','')
    for punc in string.punctuation:
        if punc == "'":
            query=query.replace(punc,'')
        else:
            query=query.replace(punc,' ')
    return query.replace('br',' ')

def load_reviews(path):
    reviews = []
    for file_name in read_files(path):
        with open(file_name, encoding='UTF-8') as file:
            reviews.append([clean_query(file.read())])
    return reviews
    

def load_positive_reviews():
    path = "train\pos"
    return load_reviews(path)

def load_negative_reviews():
    path = "train\neg"
    return load_reviews(path)
    
def create_word_counter():
    reviews_list = load_positive_reviews()
    word_counter = {}
    for review in reviews_list:
        review_text = review[0]
        for word in review_text.split():
            if word.lower() in word_counter:
                word_counter[word.lower()] +=1
            else:
                word_counter[word.lower()] =1
    # Remove words with less than 5 appearances
    words_to_remove = set()
    clean_word_counter = {}
    for word, count in word_counter.items():
        if count < 5: 
            words_to_remove.add(word)
        else:
            clean_word_counter[word] = count

    reviews_without_uncommon_words = []
    for review in reviews_list: 
        review_text = review[0]
        text_without_uncommon_words = []
        for word in review_text.split():
            if word not in words_to_remove:
                text_without_uncommon_words.append(word)
        reviews_without_uncommon_words.append([" ".join(text_without_uncommon_words)])
        
    return clean_word_counter, reviews_without_uncommon_words

wc, clean_reviews = create_word_counter()

print(clean_reviews[0])
print(wc)                
    

['Bromwell High is a cartoon comedy It ran at the same time as some other programs about school life such as Teachers My 35 years in the teaching profession lead me to believe that Bromwell Highs satire is much closer to reality than is Teachers The to survive financially the insightful students who can see right through their pathetic teachers pomp the of the whole situation all remind me of the schools I knew and their students When I saw the episode in which a student repeatedly tried to burn down the school I immediately recalled at High A classic line INSPECTOR Im here to sack one of your teachers STUDENT Welcome to Bromwell High I expect that many adults of my age think that Bromwell High is far fetched What a pity that it isnt']


In [18]:
def tokenize(query):
    return torch.LongTensor([model.wv.get_index(token) for token in query.split()])

def v2cembedding(query,model):
    weights = torch.FloatTensor(model.wv.vectors)
    embedding = nn.Embedding.from_pretrained(weights)

    indx=tokenize(query)
    return embedding(indx)

def lower(query):
    return " ".join(list(map(lambda x:x.lower(),query.split())))

def clean_query(query):
#     return query.translate(query.maketrans('','',string.punctuation)).replace('br','')
    for punc in string.punctuation:
        query=query.replace(punc,' ')
    return query.replace('br',' ')

# Training a new w2v embedding based on imdb vocab
modelw2v=Word2Vec(sentences=words_list
                  ,vector_size=100
                  ,min_count=1, window=5, workers=4)

v2cembedding(lower(clean_query("I basically skimmed through the movie but just enough to catch watch the plot was about. To tell you the truth it was kind of boring to me and at some spots it didn't make sense. The only reason I watched this movie in the first place was to see CHACE CRAWFORD!!! He is so hot, but in this movie his hair was kind of weird. But still hot.<br /><br />However, despite how hot CHACE is, it really did not make up for the film. I guess the plot isn't that bad but what really threw me over was the fact that they cuss in like every sentence. Is it that hard to express your anger without saying the F word every time?The cussing was annoying and the whole flashy, camera shaking thing gave me a headache.<br /><br />All in all, although the plot was OK, I found the film to be a bore and over dramatic. That's why I only cut to scenes with CHACE in it. LOL Anyways, not worth renting unless your a die-hard fan of a specific cast member like I was. Oh yeah the cast was Hot. The girls were HOT!!! But CHACE IS THE BEST!!")),modelw2v)

NameError: name 'model' is not defined

In [None]:
print(clean_query("I basically skimmed through the movie but just enough to catch watch the plot was about. To tell you the truth it was kind of boring to me and at some spots it didn't make sense. The only reason I watched this movie in the first place was to see CHACE CRAWFORD!!! He is so hot, but in this movie his hair was kind of weird. But still hot.<br /><br />However, despite how hot CHACE is, it really did not make up for the film. I guess the plot isn't that bad but what really threw me over was the fact that they cuss in like every sentence. Is it that hard to express your anger without saying the F word every time?The cussing was annoying and the whole flashy, camera shaking thing gave me a headache.<br /><br />All in all, although the plot was OK, I found the film to be a bore and over dramatic. That's why I only cut to scenes with CHACE in it. LOL Anyways, not worth renting unless your a die-hard fan of a specific cast member like I was. Oh yeah the cast was Hot. The girls were HOT!!! But CHACE IS THE BEST!!"))

In [None]:
stopwords=stopwords.words('english')
print (stopwords)

In [None]:
string.punctuation

In [None]:
with open(r'C:\Users\gal73\Desktop\train\neg\0_3.txt') as file:
    print(file.read())
#train=pd.read_table('\Desktop\train\neg\0_3.txt')


In [None]:
lstm = LSTMLayer(input_size=100, hidden_size=50)
# outputs, hidden_states = lstm(input_embeddings)