<a href="https://colab.research.google.com/github/arooshiverma/API_demo/blob/master/Colab_fuzzy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## More Resources

Learn how to make the most of Python, Jupyter, Colaboratory, and related tools with these resources:

### Working with Notebooks in Colaboratory
- [Overview of Colaboratory](/notebooks/basic_features_overview.ipynb)
- [Guide to Markdown](/notebooks/markdown_guide.ipynb)
- [Importing libraries and installing dependencies](/notebooks/snippets/importing_libraries.ipynb)
- [Saving and loading notebooks in GitHub](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)
- [Interactive forms](/notebooks/forms.ipynb)
- [Interactive widgets](/notebooks/widgets.ipynb)

### Working with Data
- [Loading data: Drive, Sheets, and Google Cloud Storage](/notebooks/io.ipynb) 
- [Charts: visualizing data](/notebooks/charts.ipynb)
- [Getting started with BigQuery](/notebooks/bigquery.ipynb)

### Machine Learning Crash Course
These are a few of the notebooks from Google's online Machine Learning course. See the [full course website](https://developers.google.com/machine-learning/crash-course/) for more.
- [Intro to Pandas](/notebooks/mlcc/intro_to_pandas.ipynb)
- [TensorFlow concepts](/notebooks/mlcc/tensorflow_programming_concepts.ipynb)
- [First steps with TensorFlow](/notebooks/mlcc/first_steps_with_tensor_flow.ipynb)
- [Intro to neural nets](/notebooks/mlcc/intro_to_neural_nets.ipynb)
- [Intro to sparse data and embeddings](/notebooks/mlcc/intro_to_sparse_data_and_embeddings.ipynb)

### Using Accelerated Hardware
- [TensorFlow with GPUs](/notebooks/gpu.ipynb)
- [TensorFlow with TPUs](/notebooks/tpu.ipynb)

In [1]:
# Upload the corpus file into the Colab runtime; the value returned by
# files.upload() is kept in `uploaded`. (The recorded run saved
# finalflexdata.txt, which is the file name configured further down.)
from google.colab import files
uploaded = files.upload()

Saving finalflexdata.txt to finalflexdata.txt


In [5]:
# Download the large English spaCy model; it is loaded later in this notebook
# with spacy.load('en_core_web_lg').
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz#egg=en_core_web_lg==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
[K     |████████████████████████████████| 852.3MB 6.4MB/s 
[?25hBuilding wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Stored in directory: /tmp/pip-ephem-wheel-cache-np6bc5j_/wheels/0d/bc/67/e6a9108ab86cd076703af19ad4e0f02f57381ac6583df16249
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_lg -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')



In [6]:
!pip install pyLDAvis



In [0]:
import csv
import os
import json
import spacy
import pandas as pd
import itertools as it
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import os
import codecs
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import _pickle as pickle
#spacy.require_gpu()

# Load the large English spaCy pipeline once; every parsing helper in this
# notebook uses this shared `nlp` object.
nlp = spacy.load('en_core_web_lg')

# `global` at module (cell) level has no runtime effect; these lines only
# announce the two path settings that are assigned in the next cell.
# The second name previously read `intermediate_director` (typo).
global review_txt_filepath
global intermediate_directory

In [0]:
# Notebook-wide path settings.
#   intermediate_directory: prefix joined onto every intermediate file name
#                           (empty string => current working directory).
#   review_txt_filepath:    input corpus that line_review() reads line by line.
global review_txt_filepath, intermediate_directory
intermediate_directory = ''
review_txt_filepath = 'finalflexdata.txt'

In [0]:
def punct_space(token):
    """
    Tell whether a token should be dropped because it is
    punctuation or whitespace.

    Returns ``token.is_punct`` when that is truthy, otherwise
    ``token.is_space``.
    """
    is_punct = token.is_punct
    if is_punct:
        return is_punct
    return token.is_space

def line_review(filename):
    """
    Lazily yield the lines of a UTF-8 text file, turning every
    escaped line break (a literal backslash followed by ``n``)
    back into a real newline character.

    The file stays open only while the generator is being consumed.
    """
    with codecs.open(filename, encoding='utf_8') as reviews_file:
        yield from (raw_line.replace('\\n', '\n') for raw_line in reviews_file)
            
def lemmatized_sentence_corpus(filename):
    """
    Stream the file through the shared spaCy pipeline and yield one
    string per detected sentence: the sentence's lemmas joined by
    single spaces, with punctuation/whitespace tokens left out.
    """
    reviews = line_review(filename)

    for doc in nlp.pipe(reviews, batch_size=10000, n_threads=4):
        for sent in doc.sents:
            lemmas = (token.lemma_ for token in sent
                      if not punct_space(token))
            yield u' '.join(lemmas)
							 
# Location of the text file holding the lemmatized (unigram) sentences.
global unigram_sentences_filepath
unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_allm.txt')
										  
def trigram_bow_generator(filepath):
    """
    Yield, for each line that LineSentence reads from `filepath`,
    the bag-of-words produced by the notebook-level
    `trigram_dictionary.doc2bow`.
    """
    yield from (trigram_dictionary.doc2bow(tokens)
                for tokens in LineSentence(filepath))
		
def explore_topic(topic_number, topn=25, model=None):
    """
    Print a two-column table (term, frequency) for one topic.

    Parameters
    ----------
    topic_number : id of the topic to show.
    topn : how many terms to print. This value is now passed through to
        ``show_topic``; previously it was ignored and 25 was always used.
    model : object exposing ``show_topic(topic_number, topn=...)``.
        Defaults to the notebook-level ``lda`` model when omitted.
    """
    if model is None:
        # Resolved at call time so the function can be defined before `lda`.
        model = lda

    print (u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in model.show_topic(topic_number, topn=topn):
        print (u'{:20} {:.3f}'.format(term, round(frequency, 3)))

		
		
def get_sample_review(review_number,review_txt_filepath):
    """
    Return the line at zero-based position `review_number` of the
    reviews file, as produced by line_review().

    Raises IndexError when the file has too few lines.
    """
    reviews = line_review(review_txt_filepath)
    selected = list(it.islice(reviews, review_number, review_number + 1))
    return selected[0]
						  
						  
						  
						  
def lda_description(review_text, min_topic_freq=0.05):
    """
    Print the topics most related to one piece of text.

    Steps: (1) parse the text with spaCy, (2) lemmatize and drop
    punctuation/whitespace, (3) apply the bigram and then the trigram
    phrase model, (4) drop stop words, (5) convert to a bag-of-words and
    look it up in the LDA model, (6) print topic name and rounded
    frequency, highest frequency first, stopping at the first topic whose
    frequency is below `min_topic_freq`.

    Relies on the notebook-level `nlp`, `bigram_model`, `trigram_model`,
    `trigram_dictionary`, `lda` and `topic_names`.
    """
    doc = nlp(review_text)

    # lemmas of every token that is neither punctuation nor whitespace
    lemmas = [token.lemma_ for token in doc if not punct_space(token)]

    # first-order, then second-order phrase model
    phrased = trigram_model[bigram_model[lemmas]]

    # remove any remaining stop words
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    content_terms = [term for term in phrased if term not in stop_words]

    review_bow = trigram_dictionary.doc2bow(content_terms)

    # (topic_number, freq) pairs, most strongly related topic first
    ranked_topics = sorted(lda[review_bow], key=lambda pair: -pair[1])

    for topic_number, freq in ranked_topics:
        if freq < min_topic_freq:
            break

        print ('{:25} {}'.format(topic_names[topic_number], round(freq, 3)))
								
def word_algebra(add=None, subtract=None, topn=1):
    """
    Combine the vectors of the words in `add` (positive) and `subtract`
    (negative), look up the `topn` most similar terms to the combined
    vector, and print each resulting term on its own line.

    `add` and `subtract` default to "no words". The defaults are None
    instead of shared mutable lists, and the lookup goes through
    `food2vecm.wv`, the same accessor the rest of this notebook uses,
    instead of the deprecated model-level `most_similar`.
    """
    positive = add if add is not None else []
    negative = subtract if subtract is not None else []

    answers = food2vecm.wv.most_similar(positive=positive,
                                        negative=negative,
                                        topn=topn)

    for term, _similarity in answers:
        print (term)
		
		
def get_related_terms(token, topn=10):
    """
    Print the `topn` terms most similar to `token`, one per line, as a
    20-character-wide term column followed by the similarity rounded to
    three decimals.

    The lookup goes through `food2vecm.wv`, the same accessor the rest of
    this notebook uses, instead of the deprecated model-level
    `most_similar`.
    """
    for word, similarity in food2vecm.wv.most_similar(positive=[token], topn=topn):
        print (u'{:20} {}'.format(word, round(similarity, 3)))


In [10]:
# Read only the line at index 8 of the corpus and restore its escaped
# line breaks.
with codecs.open(review_txt_filepath, encoding='utf_8') as f:
    sample_review = list(it.islice(f, 8, 9))[0].replace('\\n', '\n')

print (sample_review)

# Run the sample through the spaCy pipeline and echo the parsed document.
parsed_review = nlp(sample_review)
print(parsed_review)

Weatherford International PLC or WEATHERFORD INTERNATIONAL LTD or Weatherford International PLC has website www.weatherford.com  belogs to  category Energy  group Oil & Gas Equipment & Services sector Basic Materials.Weatherford International public limited company, together with its subsidiaries, operates as a multinational oilfield service company worldwide. It offers equipment and services used in the drilling, evaluation, completion, production, and intervention of oil and natural gas wells. The company operates through three business groups: Formation Evaluation and Well Construction, Completion and Production, and Land Drilling Rigs. The Formation Evaluation and Well Construction business group provides managed-pressure drilling, drilling services, tubular running services, drilling tools and rental equipment, wireline services, testing and production services, re-entry and fishing services, cementing products, liner systems, reservoir solutions, and surface logging systems. The 

In [11]:
# Sentences: a numbered header, the sentence itself, then an empty line.
for num, sentence in enumerate(parsed_review.sents):
    print ('Sentence {}:'.format(num + 1), sentence, '', sep='\n')

# Entities: number, entity text and label on one line, then an empty line.
for num, entity in enumerate(parsed_review.ents):
    print ('Entity {}:'.format(num + 1), entity, '-', entity.label_, end='\n\n')

Sentence 1:
Weatherford International PLC or WEATHERFORD INTERNATIONAL LTD or Weatherford International PLC has website www.weatherford.com  

Sentence 2:
belogs to  category Energy  group Oil & Gas Equipment & Services sector Basic Materials.

Sentence 3:
Weatherford International public limited company, together with its subsidiaries, operates as a multinational oilfield service company worldwide.

Sentence 4:
It offers equipment and services used in the drilling, evaluation, completion, production, and intervention of oil and natural gas wells.

Sentence 5:
The company operates through three business groups: Formation Evaluation and Well Construction, Completion and Production, and Land Drilling Rigs.

Sentence 6:
The Formation Evaluation and Well Construction business group provides managed-pressure drilling, drilling services, tubular running services, drilling tools and rental equipment, wireline services, testing and production services, re-entry and fishing services, cementing 

In [12]:
# Surface form and part-of-speech tag of every token, in parallel lists.
token_text = [tok.orth_ for tok in parsed_review]
token_pos = [tok.pos_ for tok in parsed_review]

# One row per token; the bare last expression renders the table.
pd.DataFrame(list(zip(token_text, token_pos)),
             columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,Weatherford,PROPN
1,International,PROPN
2,PLC,PROPN
3,or,CCONJ
4,WEATHERFORD,PROPN
5,INTERNATIONAL,PROPN
6,LTD,PROPN
7,or,CCONJ
8,Weatherford,PROPN
9,International,PROPN


In [0]:
# Column labels, in the same order as the attributes collected below.
attribute_columns = ['text',
                     'log_probability',
                     'stop?',
                     'punctuation?',
                     'whitespace?',
                     'number?',
                     'out of vocab.?']

# One tuple of token-level attributes per token of the sample review.
token_attributes = [
    (tok.orth_, tok.prob, tok.is_stop, tok.is_punct,
     tok.is_space, tok.like_num, tok.is_oov)
    for tok in parsed_review
]

df = pd.DataFrame(token_attributes, columns=attribute_columns)


In [14]:
# Replace the flag columns ('stop?' through 'out of vocab.?') with
# 'Yes' for truthy values and an empty string otherwise, then show the table.
flag_columns = df.loc[:, 'stop?':'out of vocab.?']
df.loc[:, 'stop?':'out of vocab.?'] = flag_columns.applymap(
    lambda x: u'Yes' if x else u'')
df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,Weatherford,-16.338444,,,,,
1,International,-11.928836,,,,,
2,PLC,-15.370662,,,,,
3,or,-5.654985,,,,,
4,WEATHERFORD,-19.502029,,,,,
5,INTERNATIONAL,-16.004759,,,,,
6,LTD,-15.365042,,,,,
7,or,-5.654985,,,,,
8,Weatherford,-16.338444,,,,,
9,International,-11.928836,,,,,


In [0]:
# Write every lemmatized sentence of the corpus to disk, one per line.
with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for sentence in lemmatized_sentence_corpus(review_txt_filepath):
        f.write(sentence + '\n')

# Stream the sentences back as token lists and preview positions 230-239,
# each followed by an empty line.
unigram_sentences = LineSentence(unigram_sentences_filepath)
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print (u' '.join(unigram_sentence), end='\n\n')

In [0]:
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_allm')
bigram_sentences_filepath = os.path.join(intermediate_directory,
                                         'bigram_sentences_allm.txt')

# Fit the first-order phrase model on the unigram sentences, persist it,
# and continue with the copy loaded back from disk.
bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)
bigram_model = Phrases.load(bigram_model_filepath)

# Apply the phrase model to every unigram sentence; write one sentence per line.
with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = u' '.join(bigram_model[unigram_sentence])
        f.write(bigram_sentence + '\n')

# Preview positions 230-239 of the result, each followed by an empty line.
bigram_sentences = LineSentence(bigram_sentences_filepath)
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print (u' '.join(bigram_sentence), end='\n\n')

In [0]:
trigram_model_filepath = os.path.join(intermediate_directory,
                                      'trigram_model_allm')
trigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'trigram_sentences_allm.txt')

# Fit the second-order phrase model on the bigram sentences, persist it,
# and continue with the copy loaded back from disk.
trigram_model = Phrases(bigram_sentences)
trigram_model.save(trigram_model_filepath)
trigram_model = Phrases.load(trigram_model_filepath)

# Apply the phrase model to every bigram sentence; write one sentence per line.
with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for bigram_sentence in bigram_sentences:
        trigram_sentence = u' '.join(trigram_model[bigram_sentence])
        f.write(trigram_sentence + '\n')

In [0]:
# Stream the trigram sentences back from disk and preview positions 230-239,
# each followed by an empty line.
trigram_sentences = LineSentence(trigram_sentences_filepath)

for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print (u' '.join(trigram_sentence), end='\n\n')

In [0]:
trigram_reviews_filepath = os.path.join(intermediate_directory,
                                        'trigram_transformed_reviews_allm.txt')

# Transform every review (one corpus line) and write it as one output line.
with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
    for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                  batch_size=10000, n_threads=4):

        # lemmas of all tokens that are neither punctuation nor whitespace
        unigram_review = [token.lemma_ for token in parsed_review
                          if not punct_space(token)]

        # first-order, then second-order phrase model
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]

        # drop remaining stop words and join the terms into a single line
        trigram_review = u' '.join(
            term for term in trigram_review
            if term not in spacy.lang.en.stop_words.STOP_WORDS)
        f.write(trigram_review + '\n')

# Show the line at index 11 before and after the transformation.
print (u'Original:\n')
for review in it.islice(line_review(review_txt_filepath), 11, 12):
    print (review)

print (u'----\n')
print (u'Transformed:\n')

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print (review)

In [0]:
trigram_dictionary_filepath = os.path.join(intermediate_directory,
                                           'trigram_dict_allm.dict')

trigram_reviews = LineSentence(trigram_reviews_filepath)

# Learn the dictionary by iterating over all of the transformed reviews.
trigram_dictionary = Dictionary(documents=trigram_reviews)

# Filter tokens that are very rare or too common (filter_extremes) and
# reassign integer ids (compactify).
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
trigram_dictionary.compactify()

# Persist the dictionary, then continue with the copy loaded from disk.
trigram_dictionary.save(trigram_dictionary_filepath)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

trigram_bow_filepath = os.path.join(intermediate_directory,
                                    'trigram_bow_corpus_allm.mm')

# Stream the bag-of-words vectors of all transformed reviews to disk.
bow_stream = trigram_bow_generator(trigram_reviews_filepath)
MmCorpus.serialize(trigram_bow_filepath, bow_stream)

# Load the finished bag-of-words corpus from disk.
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_allm')

# Seed for LDA training. Topic names are assigned by topic id elsewhere in
# this notebook, so the topic ids need to be as stable as possible across
# re-runs. NOTE(review): with several workers, results may still vary
# slightly between runs; confirm the hand-assigned names after retraining.
LDA_RANDOM_SEED = 42

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=50,
                       id2word=trigram_dictionary,
                       workers=3,
                       random_state=LDA_RANDOM_SEED)

lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)
explore_topic(topic_number=0)

In [0]:
# Walk over every topic of the trained model. The count comes from the model
# itself instead of a hard-coded 50, so this loop stays correct if the number
# of topics used for training changes.
for i in range(lda.num_topics):
    print('topic:',i)
    explore_topic(topic_number=i)

In [0]:
# Human-readable label for each topic id; the position in the tuple below is
# the topic id (0-49).
# NOTE(review): 'energy' appears twice (ids 2 and 49), and 'sushi' (14) and
# 'greek' (47) look out of place next to the other labels - confirm these
# against the actual topic terms.
topic_names = dict(enumerate((
    u'company',            # 0
    u'product',
    u'energy',
    u'system',
    u'service',
    u'business',           # 5
    u'market',
    u'investment',
    u'media',
    u'brand',
    u'management',         # 10
    u'software',
    u'technology',
    u'minerals',
    u'sushi',
    u'industry',           # 15
    u'oil',
    u'price',
    u'operation',
    u'trust',
    u'airlines',           # 20
    u'equipment',
    u'water',
    u'real_estate',
    u'commercial',
    u'research',           # 25
    u'finance',
    u'healthcare',
    u'consumer_goods',
    u'banking',
    u'electronics',        # 30
    u'metals',
    u'defence',
    u'aerospace',
    u'drugs',
    u'food',               # 35
    u'wholesale',
    u'chemical',
    u'telecommunication',
    u'leisure',
    u'insurance',          # 40
    u'automotive',
    u'customer',
    u'retail',
    u'transport',
    u'utilities',          # 45
    u'construction',
    u'greek',
    u'manufacturing',
    u'energy',             # 49
)))

In [0]:
# Persist the topic-id -> label mapping next to the other intermediate files.
topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')

with open(topic_names_filepath, mode='wb') as f:
    pickle.dump(topic_names, f)
	
	
LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')

# Build the visualization data from the LDA model, the bag-of-words corpus
# and the dictionary.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                          trigram_dictionary)

with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
# NOTE: unpickling can execute arbitrary code; only load files that this
# notebook wrote itself.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# enable_notebook() only switches on notebook rendering; the prepared data
# must be handed to display(). Previously the data was passed to
# enable_notebook(), where it was taken as the `local` flag and nothing was
# shown.
pyLDAvis.enable_notebook()
pyLDAvis.display(LDAvis_prepared)


In [0]:
# Fetch the corpus line at index 50 and show it.
sample_review = get_sample_review(review_number=50,
                                  review_txt_filepath=review_txt_filepath)
print (sample_review)

In [0]:
# Print the topics most related to the sample fetched above.
lda_description(review_text=sample_review)

In [0]:
from gensim.models import Word2Vec

trigram_sentences = LineSentence(trigram_sentences_filepath)
word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')

# Hyper-parameters for the word2vec model, gathered in one place.
word2vec_params = dict(size=100, window=5, min_count=20, sg=1, workers=4)

# Build the model from the trigram sentences and persist it.
food2vecm = Word2Vec(trigram_sentences, **word2vec_params)
food2vecm.save(word2vec_filepath)

In [0]:
# Eleven further training calls over the same sentences; the model is saved
# after each call so that progress survives an interrupted run.
for i in range(1, 12):
    food2vecm.train(trigram_sentences,
                    total_examples=food2vecm.corpus_count,
                    epochs=food2vecm.iter)
    food2vecm.save(word2vec_filepath)

# Load the finished model from disk, then call init_sims().
# NOTE(review): presumably init_sims() is what makes the normalized vectors
# (syn0norm) used later available - confirm against the gensim version in use.
food2vecm = Word2Vec.load(word2vec_filepath)
food2vecm.init_sims()

In [0]:
# NOTE(review): train_count presumably counts train() calls rather than
# epochs - confirm before relying on the wording of the first message.
training_runs = food2vecm.train_count
vocabulary_size = len(food2vecm.wv.vocab)

print (u'{} training epochs so far.'.format(training_runs))
print (u'{:,} terms in the food2vecm vocabulary.'.format(vocabulary_size))


In [0]:
# (term, index, count) for every vocabulary entry, most frequent term first
# (sorted on the negated count).
ordered_vocab = sorted(
    ((term, voc.index, voc.count)
     for term, voc in food2vecm.wv.vocab.items()),
    key=lambda entry: -entry[2])

# Unzip into three parallel tuples: terms, integer indices and counts.
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# One row per term, in frequency order, labelled with the term and taken
# from the syn0norm matrix.
word_vectors = pd.DataFrame(food2vecm.wv.syn0norm[term_indices, :],
                            index=ordered_terms)

word_vectors