In [None]:
from __future__ import division
from importlib import import_module
from matplotlib import pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
import pickle
import pandas as pd
import copy
import string
import os
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from statsmodels.base.model import GenericLikelihoodModel
# Seed NumPy's global RNG so the random draws made later in the notebook
# (the scipy.stats ``rvs`` calls) are reproducible from a fresh kernel.
np.random.seed(123456789)

In [None]:
# !pip install nltk
# !pip install pandas
# !pip install numpy
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# !pip install ipython-autotime

# %load_ext autotime

# Ekstrak Data

In [None]:
# Name of the folder, relative to the current working directory, whose files
# are collected into `paths` below.
# NOTE(review): despite the ".csv" suffix this name is walked as a directory —
# confirm it is the intended corpus location.
title = 'doc.csv'

In [None]:
# Collect the path of every file under <cwd>/<title>/, recursing into sub-folders.
corpus_dir = os.path.join(os.getcwd(), title)
paths = []
for dirpath, _dirnames, filenames in os.walk(corpus_dir):
    for filename in filenames:
        # os.path.join picks the right separator and avoids the doubled "/"
        # that manual string concatenation produced for top-level files.
        paths.append(os.path.join(dirpath, filename))
# os.walk yields entries in arbitrary (filesystem-dependent) order. Sorting
# keeps the position of each file in `paths` stable across runs and machines.
paths.sort()

In [None]:
# Preview only the first few collected paths; dumping the whole list floods
# the cell output once the corpus has more than a handful of files.
paths[:10]

In [None]:
# Number of files that were collected into `paths`.
len(paths)

# Preprocessing data

In [None]:
def remove_header(data):
    r"""Drop everything before the first blank line of ``data``.

    The text up to the first ``'\n\n'`` is treated as a header and discarded.
    The returned string starts at that blank line, so the two newline
    characters themselves are kept.

    Parameters
    ----------
    data : str
        Raw document text.

    Returns
    -------
    str
        ``data`` from the first blank line onwards, or ``data`` unchanged
        (after printing "No Header") when it contains no blank line.
    """
    try:
        ind = data.index('\n\n')
    except ValueError:
        # str.index raises ValueError when the separator is absent. Only that
        # case means "no header"; any other exception is a genuine error and
        # must not be hidden behind the "No Header" message.
        print("No Header")
        return data
    return data[ind:]

In [None]:

def convert_lower_case(data):
    """Lower-case ``data`` element-wise with ``np.char.lower``.

    The value is returned exactly as NumPy produces it (a NumPy string
    object rather than a built-in ``str``).
    """
    lowered = np.char.lower(data)
    return lowered

In [None]:
def remove_stop_words(data):
    """Remove English stop words from ``data``.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``;
    every token that appears in NLTK's English stop-word list is dropped.
    Tokens are compared as-is, so the match is case-sensitive.

    Returns
    -------
    The surviving tokens joined by single spaces, passed through
    ``np.char.strip``.
    """
    # A set gives constant-time membership tests; the plain list returned by
    # NLTK would be scanned from the start for every single token.
    stop_words = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(str(data)) if w not in stop_words]
    # Joining once avoids re-copying an ever-growing string inside the loop.
    return np.char.strip(" ".join(kept))

In [None]:
def remove_punctuation(data):
    """Replace punctuation in ``data`` with spaces and delete commas.

    Every character of ``symbols`` (which includes the backslash and the
    newline) is replaced by one space. After each replacement a single pass
    collapses double spaces into single ones. Commas are removed outright,
    without inserting a space.

    Returns the result of the chained ``np.char.replace`` calls.
    """
    # "\\" spells the backslash explicitly. The former "\]" was an invalid
    # escape sequence (DeprecationWarning / SyntaxWarning depending on the
    # Python version) that only happened to yield the same two characters.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [None]:
def remove_apostrophe(data):
    """Delete every apostrophe character from ``data`` via ``np.char.replace``."""
    apostrophe, nothing = "'", ""
    return np.char.replace(data, apostrophe, nothing)

In [None]:
def remove_single_characters(data):
    """Drop one-character tokens from ``data``.

    ``str(data)`` is split with ``word_tokenize``; tokens longer than one
    character are re-joined with single spaces and the result is passed
    through ``np.char.strip``.
    """
    long_tokens = [tok for tok in word_tokenize(str(data)) if len(tok) > 1]
    return np.char.strip(" ".join(long_tokens))

In [None]:
def convert_numbers(data):
    """Spell out every decimal digit in ``data`` as a space-padded English word.

    Digits are replaced one character at a time, so a multi-digit number
    becomes a sequence of digit words rather than the name of its value.
    """
    replacements = (
        ("0", " zero "),
        ("1", " one "),
        ("2", " two "),
        ("3", " three "),
        ("4", " four "),
        ("5", " five "),
        ("6", " six "),
        ("7", " seven "),
        ("8", " eight "),
        ("9", " nine "),
    )
    for digit, spelled in replacements:
        data = np.char.replace(data, digit, spelled)
    return data

In [None]:
def stemming(data):
    """Reduce every token of ``data`` to its Porter stem.

    ``str(data)`` is tokenized with ``word_tokenize``, each token is passed to
    a newly created ``PorterStemmer`` and the stems are joined with single
    spaces before ``np.char.strip`` is applied.
    """
    porter = PorterStemmer()
    stems = [porter.stem(tok) for tok in word_tokenize(str(data))]
    return np.char.strip(" ".join(stems))

In [None]:
def preprocess(data, query):
    """Run the text-normalisation pipeline on ``data``.

    Parameters
    ----------
    data : text to normalise.
    query : when falsy, ``remove_header`` is applied to ``data`` first;
        when truthy that step is skipped.

    Returns the output of the final stage (``stemming``).
    """
    if not query:
        data = remove_header(data)
    # Order matters: every stage consumes the previous stage's output.
    pipeline = (
        convert_lower_case,
        convert_numbers,
        remove_punctuation,  # commas are removed separately inside this stage
        remove_stop_words,
        remove_apostrophe,
        remove_single_characters,
        stemming,
    )
    for stage in pipeline:
        data = stage(data)
    return data

# Buat Unigram

In [None]:
# Build a positional inverted index over the files listed in `paths`.
#   postings[token][0]  -> list of [doc, {positions}] entries, in increasing doc order
#   frequency[token][0] -> number of documents that contain the token
# `doc` is the position of the file in `paths`; `pos` is the token offset
# inside the preprocessed document.
postings_by_token = {}
doc_frequency = {}
doc = 0

for path in paths:
    # Index the file's contents. Passing `path` itself to preprocess() (as was
    # done before) builds the index from the file names instead of the text.
    # NOTE(review): UTF-8 is assumed and undecodable bytes are replaced so one
    # odd file cannot abort the run — change `encoding` if the corpus differs.
    with open(path, 'r', encoding='utf-8', errors='replace') as handle:
        raw_text = handle.read()

    preprocessed_text = preprocess(raw_text, False)
    if doc % 100 == 0:
        print(doc)  # coarse progress indicator
    tokens = word_tokenize(str(preprocessed_text))

    for pos, token in enumerate(tokens):
        entries = postings_by_token.setdefault(token, [])
        # Documents are visited in increasing `doc` order, so an entry for the
        # current document can only be the last one — no need to scan the list.
        if entries and entries[-1][0] == doc:
            entries[-1][1].add(pos)
        else:
            entries.append([doc, {pos}])
            doc_frequency[token] = doc_frequency.get(token, 0) + 1
    doc += 1

# Create the single-row frames once at the end. Inserting one column per new
# token fragments the frame, and the chained `frequency[token][0] += 1` update
# is not guaranteed to write through to the frame (pandas copy-on-write).
postings = pd.DataFrame([postings_by_token])
frequency = pd.DataFrame([doc_frequency])

In [None]:
def get_word_postings(word):
    """Print the index statistics of ``word``.

    The word is normalised with ``preprocess(word, True)`` and looked up in the
    module-level ``frequency`` and ``postings`` frames. Printed, in order: the
    normalised term, its document frequency, its postings list, and for each
    posting the share of the term's occurrences that fall in that document.

    Raises
    ------
    KeyError
        If the normalised term is not a column of the index frames.
    """
    preprocessed_word = str(preprocess(word, True))
    print(preprocessed_word)
    print("Frequency:", frequency[preprocessed_word][0])
    word_postings = postings[preprocessed_word][0]
    print("Postings List:", word_postings)

    # Each posting is [doc, {positions}]; the occurrence count is the size of
    # the position set. len(posting) is always 2 and would make every share
    # equal to 1 / len(word_postings).
    total = sum(len(positions) for _doc, positions in word_postings)

    # One line per posting: the print and the counter increment must run inside
    # the loop, otherwise only the last posting is reported (always as number 1).
    # `i` is the 1-based rank within the postings list, not the document id.
    for i, (_doc, positions) in enumerate(word_postings, start=1):
        prob = len(positions) / total
        print("Document ", i, ":", prob)

In [None]:
# Default mixing weight; it is bound as zip_jm's default argument at definition
# time, so re-assigning `lambda_` later does not change that default.
lambda_ = 0.5
def zip_jm(x, lambda_=lambda_):
 """Return zeros shaped like ``x`` when ``lambda_`` is outside [0, 1], else a weighted sum.

 The in-range branch evaluates
 ``(x == 0) * (1 - lambda_) * (kata / d) + lambda_ * c``.

 NOTE(review): ``kata``, ``d`` and ``c`` are neither parameters nor defined
 anywhere in the visible notebook, so that branch raises ``NameError`` unless
 they exist as globals — confirm where they are supposed to come from.
 NOTE(review): the name hints at Jelinek-Mercer smoothing — TODO confirm the
 intended formula before relying on this function.
 """
 # Out-of-range weights yield an all-zero result instead of raising.
 if lambda_ < 0 or lambda_ > 1 :
    return np.zeros_like(x)
 else:
    return (x == 0) * (1-lambda_) * (kata/d) + lambda_ * c



# NOTE(review): `paths` is a Python list of file-path strings, so `x == 0`
# inside zip_jm is a single bool rather than an element-wise mask — confirm
# that a numeric array was meant to be passed here.
zip_jm(paths)

In [None]:
# Figure 1: side-by-side bars of the Poisson pmf and a second pmf for x = 0..9.
fig, ax = plt.subplots(figsize=(8, 6))
xs = np.arange(0, 10)
palette = sns.color_palette()
ax.bar(2.5 * xs, stats.poisson.pmf(xs, lambda_), width=0.9,
       color=palette[0], label='Poisson')
# NOTE(review): `zip_pmf` is not defined anywhere in the visible notebook; this
# call raises NameError unless it is defined elsewhere — confirm the intended
# function.
ax.bar(2.5 * xs + 1, zip_pmf(xs), width=0.9, color=palette[1],
       label='Zero-inflated Poisson')
ax.set_xticks(2.5 * xs + 1)
ax.set_xticklabels(xs)
ax.set_xlabel('$x$')
ax.set_ylabel('$P(X = x)$')
ax.legend()

# Figure 2: histogram of N simulated draws in which a Bernoulli mask forces
# some of the Poisson samples to zero.
N = 1000
# NOTE(review): `pi` is not defined anywhere in the visible notebook — confirm
# the intended zero-inflation probability and define it before this cell.
inflated_zero = stats.bernoulli.rvs(pi, size=N)
x = (1 - inflated_zero) * stats.poisson.rvs(lambda_, size=N)
fig, ax = plt.subplots(figsize=(8, 6))
# `normed` was removed from Axes.hist; `density=True` is its replacement and
# likewise normalises the histogram.
ax.hist(x, width=0.8, bins=np.arange(x.max() + 1), density=True)
ax.set_xticks(np.arange(x.max() + 1) + 0.4)
ax.set_xticklabels(np.arange(x.max() + 1))
ax.set_xlabel('$x$')
ax.set_ylabel('Proportion of samples');