# IMPORTANT CODE SNIPPETS
One stop destination for all code snippets,
which are useful for processes in ML.

The code gets updated from time to time as
I learn more and more techniques and 
document them.

Happy Machine Learning.
We are the future.

@author: Bikram Dutta

## Word Frequecy in text

In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer

sampleTextList = [['rocky','baby','rocky'], ['rocky','rocky', 'rambo', 'rambo'], ['adrian', 'rocky', 'adrian', 'rambo']]


cv = CountVectorizer()
newList = []
for sampleList in sampleTextList:
    newList.append(' '.join(sampleList))
    
sampleTextList = newList

# for just count matrix
cv_fit = cv.fit_transform(sampleTextList)

print(cv.get_feature_names())
print(cv_fit.toarray())

wordList = cv.get_feature_names()
countList = cv_fit.toarray().sum(axis=0)
wordCountMap = dict(zip(wordList,countList))

print(wordCountMap)

# for every text in the list

for text in sampleTextList:
    cv = CountVectorizer()
    cv_fit = cv.fit_transform(text.split())
    wordList = cv.get_feature_names()
    countList = cv_fit.toarray().sum(axis=0)
    wordCountMap = dict(zip(wordList,countList))
    print(wordCountMap)



['adrian', 'baby', 'rambo', 'rocky']
[[0 1 0 2]
 [0 0 2 2]
 [2 0 1 1]]
{'adrian': 2, 'baby': 1, 'rambo': 3, 'rocky': 5}
{'baby': 1, 'rocky': 2}
{'rambo': 2, 'rocky': 2}
{'adrian': 2, 'rambo': 1, 'rocky': 1}


In [2]:
import numpy
arr = numpy.array(newList)
print('Original Numpy Array : ' , arr)
 
# Get a tuple of unique values & their frequency in numpy array
uniqueValues, occurCount = numpy.unique(arr, return_counts=True)

valueDict = dict(zip(uniqueValues, occurCount))
 
# print("Unique Values : " , uniqueValues)
# print("Occurrence Count : ", occurCount)
valueDict

Original Numpy Array :  ['rocky baby rocky' 'rocky rocky rambo rambo' 'adrian rocky adrian rambo']


{'adrian rocky adrian rambo': 1,
 'rocky baby rocky': 1,
 'rocky rocky rambo rambo': 1}

In [3]:
sampleList = [['rocky','baby','rocky'], ['rocky','rocky', 'rambo', 'rambo'], ['adrian', 'rocky', 'adrian', 'rambo']]
newList = []
for sample in sampleList:
    newList.extend(sample)
    
newList

['rocky',
 'baby',
 'rocky',
 'rocky',
 'rocky',
 'rambo',
 'rambo',
 'adrian',
 'rocky',
 'adrian',
 'rambo']

In [4]:
wordCountDf = pd.DataFrame(list(zip(wordList,countList)),columns =['Word', 'Count']) 
wordCountDf.sort_values(by=['Count'], inplace=True, ascending = False)
wordCountDf.reset_index(inplace=True, drop=True)
wordCountDf
# list(wordCountDf['Word'])

Unnamed: 0,Word,Count
0,adrian,2
1,rambo,1
2,rocky,1


In [5]:
sampleTextList = [['rocky baby rocky', 'rocky rocky rambo rambo'], ['adrian rocky adrian rambo']]
wordList = []
list(map(wordList.extend, sampleTextList))
print(wordList)

['rocky baby rocky', 'rocky rocky rambo rambo', 'adrian rocky adrian rambo']


## Cosine Similarity

In [6]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer

sampleTextList = ['rocky baby rocky', 'rocky rocky rambo rambo', 'adrian rocky adrian rambo']
cv = CountVectorizer()

# for just count matrix
cv_fit = cv.fit_transform(sampleTextList)

from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(cv_fit)

print(similarity_scores)

[[1.         0.63245553 0.36514837]
 [0.63245553 1.         0.57735027]
 [0.36514837 0.57735027 1.        ]]


# n-gram Containment

In [1]:
import numpy as np
import sklearn

from sklearn.feature_extraction.text import CountVectorizer

a_text = "This is an answer text"
s_text = "This is a source text"

# set n
n = 1

# instantiate an ngram counter
    counts = CountVectorizer(analyzer='word', ngram_range=(n,n))

# create a dictionary of n-grams by calling `.fit`
vocab2int = counts.fit([a_text, s_text]).vocabulary_

# print dictionary of words:index
print(vocab2int)

# create a vocabulary for 2-grams
# counts_2grams = None
    counts_2grams = CountVectorizer(analyzer='word', ngram_range=(n+1,n+1))
# create a dictionary of 2-grams by calling `.fit`
vocabFor2grams = counts_2grams.fit([a_text, s_text]).vocabulary_
print(vocabFor2grams)


# create array of n-gram counts for the answer and source text
ngrams = counts.fit_transform([a_text, s_text])
print(ngrams)
# row = the 2 texts and column = indexed vocab terms (as mapped above)
# ex. column 0 = 'an', col 1 = 'answer'.. col 4 = 'text'
ngram_array = ngrams.toarray()
print(ngram_array)

def containment(ngram_array):
    ''' Containment is a measure of text similarity. It is the normalized, 
       intersection of ngram word counts in two texts.
       :param ngram_array: an array of ngram counts for an answer and source text.
       :return: a normalized containment value.'''
    temp = list(ngram_array)
    count = len(temp[0])
    commonTerms = 0
    commonIndexes = []
    while count>0:
        index = len(temp[0]) - count
        if temp[0][index] == temp[1][index]:
            commonTerms +=1
            commonIndexes.append(index)
        count = count -1 
    
    containment = commonTerms / np.count_nonzero(temp[0] > 0)
    
    # your code here
    
    return containment


{'this': 5, 'is': 2, 'an': 0, 'answer': 1, 'text': 4, 'source': 3}
{'this is': 5, 'is an': 2, 'an answer': 0, 'answer text': 1, 'is source': 3, 'source text': 4}
  (0, 5)	1
  (0, 2)	1
  (0, 0)	1
  (0, 1)	1
  (0, 4)	1
  (1, 5)	1
  (1, 2)	1
  (1, 4)	1
  (1, 3)	1
[[1 1 1 0 1 1]
 [0 0 1 1 1 1]]


In [2]:
# test out your code
containment_val = containment(ngrams.toarray())

print('Containment: ', containment_val)

# note that for the given texts, and n = 1
# the containment value should be 3/5 or 0.6
assert containment_val==0.6, 'Unexpected containment value for n=1.'
print('Test passed!')


# test for n = 2
counts_2grams = CountVectorizer(analyzer='word', ngram_range=(2,2))
bigram_counts = counts_2grams.fit_transform([a_text, s_text])

# calculate containment
containment_val = containment(bigram_counts.toarray())

print('Containment for n=2 : ', containment_val)

# the containment value should be 1/4 or 0.25
assert containment_val==0.25, 'Unexpected containment value for n=2.'
print('Test passed!')


NameError: name 'containment' is not defined

In [None]:
n = 1
answer_filename = 'g0pA_taska.txt'

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
counts = CountVectorizer(analyzer='word', ngram_range=(n,n))
vocab = counts.fit(list(complete_df.Text)).vocabulary_
# print(vocab)
ngrams = counts.fit_transform(list(complete_df.Text))
# print(ngrams)
ngram_array = ngrams.toarray()


complete_df['ngrams'] = list(ngram_array)

def getSourceNgram(complete_df,answer_filename):
    task = list(complete_df[complete_df.File == answer_filename]['Task'])[0]
    print(task)
    source_ngram = list(complete_df[(complete_df['Task'] == task) & (complete_df.Class == -1)].ngrams)[0].tolist()
#     print('Source ngram: ', source_ngram)
    return source_ngram

def getNgramsForText(complete_df,answer_filename):
    return list(complete_df[complete_df.File == answer_filename].ngrams)[0].tolist()

resultant = []
for element in ngram_array:
    if resultant == []:
        resultant = element
    
    else:
        resultant = np.bitwise_and(resultant,element)

globalCommonTerms = np.count_nonzero(resultant>0)

ngramsForAnswer = getNgramsForText(complete_df,answer_filename)
ngramsForSource = getSourceNgram(complete_df,answer_filename)
commonbetweenSrcAndAns = np.bitwise_and(np.array(ngramsForAnswer),np.array(ngramsForSource))

containment = np.count_nonzero(commonbetweenSrcAndAns > 0) / np.count_nonzero(np.array(ngramsForAnswer) > 0)

print(len(commonbetweenSrcAndAns),len(ngramsForSource), len(resultant))

In [None]:
ngram_array = [[1,1,1,0,1],
              [1,0,1,1,1],
              [1,1,0,0,1],
              [1,1,1,1,1]]

temp = list(ngram_array)
count = len(temp[0])
commonTerms = 0
commonIndexes = []
while count>0:
    index = len(temp[0]) - count
    if temp[0][index] == temp[1][index]:
        commonTerms +=1
        commonIndexes.append(index)
    count = count -1 

containment = commonTerms / np.count_nonzero(temp[0] > 0)



In [30]:
import numpy as np
ngram_array = [[1,1,1,0,1],
              [1,0,1,1,1],
              [1,1,0,0,1],
              [1,1,1,1,1]]

resultant = []
for element in ngram_array:
    if resultant == []:
        resultant = element
    
    else:
        resultant = np.bitwise_and(resultant,element)
        
np.count_nonzero(resultant>0)

# np.bitwise_and(ngram_array[0:2])

  if __name__ == '__main__':


2

## Get a list of categorical variables

In [None]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

## One hot encoding

In [None]:

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)