In [1]:
from IPython.core.debugger import set_trace
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import re
import string
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam
plt.style.use(style="seaborn")
%matplotlib inline

In [None]:
## source files combiner script

from os import path, getcwd, listdir
from os.path import join

## all source "folders" should be in resource folder ( e.g ./resource/kickstarter...)
current_path = join( getcwd(), "resource")
## all files will be combined to all_data.csv
combined_data_set_path = join( getcwd(), "kickstarter_data_set", "all_data.csv" )

combined_file = open(combined_data_set_path, "w", encoding="UTF-8")

dirs = listdir(current_path)

for direc in dirs:
    c_dir_path = join(current_path, direc)
    f_count = 0
    for f in listdir(c_dir_path):
        f_count +=1
        print(f"Filfe -> {f_count}")
        f_ = open(join(c_dir_path, f), "r", encoding="UTF-8")
        count = 0
        for line in f_.readlines():
            count+=1
            if f_count == 1 and count == 1:
                combined_file.write(line)

            if count==1:
                continue
            else:
                combined_file.write(line)
                combined_file.flush()

combined_file.close()

In [4]:
all_df = pd.DataFrame()
for path, subdirs, files in os.walk('resource'):
    for name in files:
        csv_file_path = os.path.join(path, name)
        df = pd.read_csv(csv_file_path)
        all_df = all_df.append(df)

In [5]:
dataframe = all_df.copy()

### Loading the Data

In [None]:
# csv_file_path = ('data/kickstarter_data_with_features.csv')
# dataframe = pd.read_csv(csv_file_path)

### Defining  some functions

In [6]:
# Multiple functions for cleaning data 

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)


def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

def remove_number(text):
    newstring = re.sub(r'[0-9]+', '', text)
    return newstring


def remove_emoji(string):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r"", string)


def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)

def remove_stopwords(text):
    # nltk.download('stopwords') 
    stop = set(stopwords.words("english"))
    text = [word.lower() for word in text.split() if word.lower() not in stop]

    return " ".join(text)


def get_vocab_size(text):
    """Count unique words"""
    count = Counter()
    for i in text.values:
        i = remove_punct(i)
        i = remove_emoji(i)
        i = remove_URL(i)
        i = remove_html(i)
        i = remove_number(i)
        for word in i.split():
            count[word] += 1
    return len(count)

### Preprocessing the data

In [7]:
dataframe['blurb']=dataframe['blurb'].astype("string") 
dataframe = dataframe[['blurb','state', 'pledged']] # only using these 2 cols
dataframe = dataframe[dataframe['state'].isin(['successful', 'failed'])]
dataframe['state'] = dataframe['state'].replace({'failed': 0, 'successful': 1})
dataframe = dataframe.dropna() 
dataframe['blurb'] = dataframe['blurb'].map(remove_stopwords) # remove stop words

In [8]:
dataframe

Unnamed: 0,blurb,state,pledged
0,"aquaponic system able grow large plants, insid...",0,23.0
1,creating artistic edibles sweet twist. please ...,0,30.0
2,bicycle® deluxe. 56 luxury hand-illustrated pl...,1,27473.0
3,pup pops water-based natural pops dogs healthy...,0,525.0
4,limited time original works anime-style art fr...,1,1131.0
...,...,...,...
3649,"back moon little star, cute illustrated book y...",1,473.5
3650,"contiene una foto per ogni giorno dell’anno, l...",1,10049.0
3651,anti-bullying rhyme picture book ugliest dog l...,1,35828.0
3652,i'm raising funds publish print first book poe...,0,409.0


### Splitting the Data

In [9]:
from sklearn.model_selection import train_test_split
X = dataframe.drop(['state', 'pledged'], axis=1)
Y = dataframe[['state', 'pledged']]
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3, random_state=123)

### Getting the Vocabular size: # of unique words

In [10]:
train_v_size = get_vocab_size(X_train.blurb)

In [11]:
train_v_size

108941

### Sequencing the text

Fitting a tokenizer to assign indices to words and converting the text to sequence of indices.

In [12]:
#fit tokenizer on training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train.blurb)
#get train sequences
train_seqs = tokenizer.texts_to_sequences(X_train.blurb)
train_seqs_max_size = max([len(seq) for seq in train_seqs])
#get test sequences
test_seqs = tokenizer.texts_to_sequences(X_test.blurb)
test_seqs_max_size = max([len(seq) for seq in test_seqs])

In [13]:
train_seqs_max_size, test_seqs_max_size

(42, 28)

### Padding the sequences

Padding the sequences to have seqencies of equal length. 

In [14]:
train_padded = pad_sequences(train_seqs, maxlen=train_seqs_max_size, padding="post", truncating="post")
test_padded = pad_sequences(test_seqs, maxlen=train_seqs_max_size, padding="post", truncating="post")

In [15]:
print(f"Shape of train {train_padded.shape}")
print(f"Shape of test {test_padded.shape}")

Shape of train (145954, 42)
Shape of test (62552, 42)


### Embeddings

In [16]:
documents = X_train.blurb
X_train_tokenized = [[word for word in document.lower().split()] for document in documents]

#### Word2Vec

In [17]:
from gensim.models import Word2Vec, FastText
word_model = Word2Vec(X_train_tokenized, vector_size=100)

#build matrix 
embedding_matrix_w2v = np.random.random(((train_v_size) + 1, 100))
for word,i in tokenizer.word_index.items():  
    try:
        embedding_matrix_ft[i] = word_model.wv[word]
    except:
        pass

# create layer
embedding_layer_w2v = Embedding((train_v_size) + 1, output_dim=100, 
                            weights=[embedding_matrix_w2v], trainable=True)



#### FastText

In [18]:
ft = FastText(vector_size=300)
ft.build_vocab(X_train_tokenized)
ft.train(tokenizer.word_index, total_examples=ft.corpus_count, epochs=10)

# build matrix
embedding_matrix_ft = np.random.random(((train_v_size) + 1, ft.vector_size))
for word,i in tokenizer.word_index.items(): 
    try:
        embedding_matrix_ft[i] = ft.wv[word]
    except:
        pass

# create layer
embedding_layer_ft = Embedding((train_v_size) + 1, output_dim=300, 
                            weights=[embedding_matrix_ft], trainable=True)

#### Keras Embeddings 

In [19]:
embedding_layer_keras = Embedding(train_v_size, output_dim=100, input_length=train_seqs_max_size)

### Modeling

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D
from keras.initializers import Constant
from keras.optimizers import Adam, SGD
from tensorflow.keras.regularizers import L2

In [None]:
def lstm_model(embeddings, classification=True):
    model = Sequential()
    model.add(embeddings)
    model.add(LSTM(20, dropout=.9))
    model.add(Dense(1, activation="sigmoid"))

    adam_opt = Adam(learning_rate=3e-4)
    if classification:
        model.compile(loss="binary_crossentropy", optimizer=adam_opt, metrics=["accuracy"])
    else: 
        model.compile(loss="mean_squared_error", optimizer=adam_opt, metrics=["mse"])
        
    return model 

def train_model(model, train_padded, test_padded, y_train, y_test):
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    history = model.fit( train_padded, y_train, epochs=20,
                            validation_data=(test_padded, y_test), callbacks=[early_stopping])
    
    return history
    
    
def evaluate_model(model, test_padded, y_test):
    results = model.evaluate(test_padded, y_test, batch_size=128)
    return results 

def plot_history(history):
    metrics_df = pd.DataFrame(history.history)
    metrics_df[["loss","val_loss"]].plot()
    

### LSTM

#### LSTM -  Predict if a project/campaign will be successful or not

##### Keras Embeddings

In [None]:
model = lstm_model(embedding_layer_keras)    
history = train_model(model, train_padded, test_padded, y_train['state'], y_test['state'])

In [None]:
keras_lstm_status_history = pd.DataFrame(history.history)
keras_lstm_status_history

In [None]:
evaluate_model(model, test_padded, y_test['state'])

In [None]:
plot_history(history)

##### Word2Vec 

In [None]:
model = lstm_model(embedding_layer_w2v)    
history = train_model(model, train_padded, test_padded, y_train['state'], y_test['state'])

In [None]:
w2v_lstm_status_history = pd.DataFrame(history.history)
w2v_lstm_status_history

In [None]:
evaluate_model(model, test_padded, y_test['state'])

In [None]:
plot_history(history)

##### FastText

In [None]:
model = lstm_model(embedding_layer_ft)    
history = train_model(model, train_padded, test_padded, y_train['state'], y_test['state'])

In [None]:
ft_lstm_status_history = pd.DataFrame(history.history)
ft_lstm_status_history

In [None]:
evaluate_model(model, test_padded, y_test['state'])

In [None]:
plot_history(history)

#### LSTM - Predict the amount of money collected

In [None]:
model = lstm_model(embedding_layer_keras, classification=False)    
history = train_model(model, train_padded, test_padded, y_train['pledged'], y_test['pledged'])

In [None]:
evaluate_model(model, test_padded, y_test['pledged'])

In [None]:
plot_history(history)

### CNN

In [None]:
def cnn_model(embeddings, classification=True):
    model = Sequential()
    model.add(embeddings)
    model.add(Conv1D(10, 3, activation='relu', kernel_regularizer=L2(0.1), bias_regularizer=L2(0.9)))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1, activation='sigmoid'))

    adam_opt = Adam(learning_rate=3e-4)
    if classification:
        model.compile(loss="binary_crossentropy", optimizer=adam_opt, metrics=["accuracy"])
    else: 
        model.compile(loss="mean_squared_error", optimizer=adam_opt, metrics=["mse"])
        
    return model 

#### CNN -  Predict if a project/campaign will be successful or not

##### Keras Embeddings 

In [None]:
model = cnn_model(embedding_layer_keras)    
history = train_model(model, train_padded, test_padded, y_train['state'], y_test['state'])

In [None]:
keras_cnn_status_history = pd.DataFrame(history.history)
keras_cnn_status_history

In [None]:
evaluate_model(model, test_padded, y_test['state'])

In [None]:
plot_history(history)

##### Word2Vec

In [None]:
model = cnn_model(embedding_layer_w2v)    
history = train_model(model, train_padded, test_padded, y_train['state'], y_test['state'])

In [None]:
w2v_cnn_status_history = pd.DataFrame(history.history)
w2v_cnn_status_history

In [None]:
evaluate_model(model, test_padded, y_test['state'])

In [None]:
plot_history(history)

##### Fasttext

In [None]:
model = cnn_model(embedding_layer_ft)    
history = train_model(model, train_padded, test_padded, y_train['state'], y_test['state'])

In [None]:
ft_cnn_status_history = pd.DataFrame(history.history)
ft_cnn_status_history

In [None]:
evaluate_model(model, test_padded, y_test['state'])

In [None]:
plot_history(history)

#### CNN - Predict the amount of money collected

In [None]:
model = lstm_model(embedding_layer_keras, classification=False)    
history = train_model(model, train_padded, test_padded, y_train['pledged'], y_test['pledged'])

In [None]:
evaluate_model(model, test_padded, y_test['pledged'])

In [None]:
plot_history(history)

### References

https://radimrehurek.com/gensim/models/word2vec.html

https://keras.io/api/