In [None]:
from IPython.core.debugger import set_trace

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

# The bare "seaborn" style name was renamed to "seaborn-v0_8" in newer
# Matplotlib releases and later removed. Use whichever name this installation
# ships, and fall back to Matplotlib's default style if neither is present.
_style_name = next(
    (name for name in ("seaborn-v0_8", "seaborn") if name in plt.style.available),
    "default",
)
plt.style.use(style=_style_name)
%matplotlib inline

In [None]:
# Location of the Kickstarter CSV. The KICKSTARTER_CSV environment variable
# overrides the machine-specific default below, so the notebook can run on
# another machine without editing this cell.
csv_file = os.environ.get(
    'KICKSTARTER_CSV',
    '/Users/nurhandeakyuz/Desktop/nlpP3/archive-2/kickstarter_data_with_features.csv',
)
dataframe = pd.read_csv(csv_file)

In [None]:
import re


def remove_URL(text):
    """Return ``text`` with http(s):// links and www. links deleted.

    A link runs from its prefix up to the next whitespace character.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)


def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each span ends at the first ``>`` that
    follows its opening ``<``.
    """
    return re.sub(r"<.*?>", "", text)

In [None]:
def remove_number(text):
    """Return ``text`` with every run of the digits 0-9 deleted."""
    digit_runs = re.compile(r'[0-9]+')
    return digit_runs.sub('', text)

In [None]:
def remove_emoji(string):
    """Return ``string`` with characters from the listed code-point ranges deleted."""
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        # NOTE(review): this last range is very wide. It spans far more than
        # emoji, e.g. CJK ideographs fall inside it and are removed too.
        "\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub("", string)

In [None]:
import string


def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # A translation table that maps a character to None deletes that character.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)

In [None]:
# Cast the blurb column to pandas' "string" dtype before any text processing.
dataframe['blurb']=dataframe['blurb'].astype("string")

In [None]:
# Keep only the two columns used below: the blurb text and the state label.
dataframe = dataframe[['blurb','state']]

In [None]:
# Keep only rows whose state is 'successful' or 'failed'. The explicit .copy()
# makes the result an independent frame, so assigning to its columns in later
# cells does not trigger pandas' SettingWithCopyWarning.
dataframe = dataframe[dataframe['state'].isin(['successful', 'failed'])].copy()

In [None]:
# Encode the label as an integer: failed -> 0, successful -> 1.
# .map plus an explicit astype yields an int64 column on every pandas version,
# whereas .replace on a string column depends on implicit downcasting that
# newer pandas deprecates.
dataframe['state'] = dataframe['state'].map({'failed': 0, 'successful': 1}).astype('int64')

In [None]:
import nltk

In [None]:
from nltk.corpus import stopwords

# English stop-word list from NLTK, stored as a set for membership tests.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available
# locally — confirm it is downloaded before running.
stop = set(stopwords.words("english"))


def remove_stopwords(text):
    """Lower-case ``text`` and drop every whitespace-separated word found in ``stop``.

    The surviving words are joined back together with single spaces.
    """
    lowered = (token.lower() for token in text.split())
    return " ".join(token for token in lowered if token not in stop)

In [None]:
# Lower-case every blurb and strip stop words. Missing blurbs are skipped
# (na_action="ignore") because remove_stopwords calls .split() on its argument
# and would raise on a missing value. Those rows stay missing here and are
# removed by the dropna() that follows.
dataframe['blurb'] = dataframe['blurb'].map(remove_stopwords, na_action="ignore")

In [None]:
# Drop the rows that contain a missing value.
dataframe = dataframe.dropna()

In [None]:
from sklearn.model_selection import train_test_split
# The label is the state column; the features are every other column.
Y = dataframe['state']
X = dataframe.drop(columns='state')
# Hold out 30% of the rows for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=123
)

In [None]:
# Blurb texts of the training split.
text_train = X_train['blurb']

In [None]:
from collections import Counter
# Count unique words
def counter_word(text):
    """Count how often each word occurs across a collection of texts.

    Every entry is cleaned before counting. URLs and HTML tags are removed
    first, because their patterns depend on punctuation such as ``://``, ``.``
    and ``<>`` and can no longer match once punctuation has been stripped.
    Emoji, punctuation and digits are removed afterwards, and the remaining
    text is split on whitespace.

    Parameters
    ----------
    text : object exposing ``.values`` that yields strings
        For example a pandas Series of blurbs.

    Returns
    -------
    collections.Counter
        Mapping from word to number of occurrences.
    """
    count = Counter()
    for entry in text.values:
        cleaned = remove_URL(entry)
        cleaned = remove_html(cleaned)
        cleaned = remove_emoji(cleaned)
        cleaned = remove_punct(cleaned)
        cleaned = remove_number(cleaned)
        count.update(cleaned.split())
    return count

In [None]:
# Word frequencies over the training blurbs.
counter = counter_word(text_train)

In [None]:
# Number of distinct words counted above; passed to the Tokenizer and to the
# Embedding layer as the vocabulary size.
num_words = len(counter)

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Create the Keras tokenizer with num_words as its vocabulary limit and fit it
# on the training blurbs only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(text_train)

In [None]:
# The fitted tokenizer's word -> integer index mapping.
word_index = tokenizer.word_index

In [None]:
# Encode each training blurb as a sequence of integer word indices.
train_sequences = tokenizer.texts_to_sequences(text_train)

In [None]:
def find_max_list(list):
    """Return the length of the longest sequence in ``list``.

    The length is also printed, as before. An empty input yields 0.

    Parameters
    ----------
    list : iterable of sized objects
        For example a list of token-index lists.

    Returns
    -------
    int
        The maximum ``len()`` over the elements, or 0 if there are none.
    """
    # NOTE: the parameter name shadows the built-in ``list``. It is kept so
    # callers that pass it by keyword keep working.
    longest = max((len(seq) for seq in list), default=0)
    print(longest)
    return longest
# max_length is whatever find_max_list returns for the training sequences; it
# is later passed as maxlen to pad_sequences and as input_length to the
# Embedding layer.
max_length = find_max_list(train_sequences)

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Pad or truncate every training sequence at its end ("post") so that the rows
# form a rectangular array; maxlen is taken from max_length.
train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding="post", truncating="post"
)

In [None]:
# Display the first padded training row.
train_padded[0]

In [None]:
# Blurb texts of the test split.
text_test = X_test.blurb

In [None]:
# Encode the test blurbs with the tokenizer that was fitted on the training blurbs.
test_sequences = tokenizer.texts_to_sequences(text_test)

In [None]:
# Run find_max_list on the test sequences (it prints their maximum length) and
# keep whatever it returns.
max_length_test = find_max_list(test_sequences)

In [None]:
# Pad the test set with the same maxlen as the training set. The model is
# built with input_length=max_length, so the test matrix must have the same
# width as the training matrix rather than a width derived from the test data.
test_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding="post", truncating="post"
)

In [None]:
# Show the first training blurb next to its encoded form. .iloc selects by
# position: after filtering and the shuffled split, the index label 0 is not
# guaranteed to exist in X_train, and train_sequences[0] is positional anyway.
print(X_train.blurb.iloc[0])
print(train_sequences[0])

In [None]:
# Inverse of word_index: integer index -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

In [None]:
def decode(text):
    """Turn a sequence of word indices back into a space-separated string.

    Indices that are missing from ``reverse_word_index`` are rendered as "?".
    """
    words = []
    for index in text:
        words.append(reverse_word_index.get(index, "?"))
    return " ".join(words)

In [None]:
# Decode the first encoded training sequence back to words as a sanity check.
decode(train_sequences[0])

In [None]:
# Report the shapes of the padded training and test arrays.
print(f"Shape of train {train_padded.shape}")
print(f"Shape of test {test_padded.shape}")

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

# Binary classifier over the padded index sequences:
# 32-dimensional embedding -> 64-unit LSTM -> single sigmoid output unit.
model = Sequential(
    [
        Embedding(num_words, 32, input_length=max_length),
        LSTM(64, dropout=0.1),
        Dense(1, activation="sigmoid"),
    ]
)

# Adam with a learning rate of 3e-4; the loss is binary cross-entropy and
# accuracy is tracked as the metric.
optimizer = Adam(learning_rate=3e-4)
model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# Print the model summary.
model.summary()

In [None]:
# Train for two epochs and keep the returned history object.
# NOTE(review): the test split is passed as validation data here and is also
# used for the final evaluate() call, so there is no separate held-out set.
history = model.fit(
    train_padded, y_train, epochs=2, validation_data=(test_padded, y_test),
)

In [None]:
# Evaluate the trained model on the padded test set in batches of 128.
results = model.evaluate(test_padded, y_test, batch_size=128)