# 1. Fully connected neural network

In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score, classification_report, confusion_matrix


In [85]:
df = pd.read_csv('weatherHistory_v1.csv')
df = df.drop('Loud Cover',axis=1)
df = df.replace('?', np.NaN)
df = df.dropna()
df.isna().sum()

Formatted Date              0
Summary                     0
Precip Type                 0
Temperature (C)             0
Apparent Temperature (C)    0
Humidity                    0
Wind Speed (km/h)           0
Wind Bearing (degrees)      0
Visibility (km)             0
Pressure (millibars)        0
Daily Summary               0
dtype: int64

In [86]:
df['Formatted Date'] = pd.Categorical(df['Formatted Date'])
df['Summary'] = pd.Categorical(df['Summary'])
df['Precip Type'] = pd.Categorical(df['Precip Type'])
df['Daily Summary'] = pd.Categorical(df['Daily Summary'])

In [87]:
for column in df.select_dtypes(include='category').columns:
    if column=='Precip Type':
        continue
    df = pd.concat([df, pd.get_dummies(df[column], prefix=column)],axis=1)
    df.drop([column],axis=1, inplace=True)

In [88]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Precip Type',axis=1), df['Precip Type'].cat.codes, test_size=0.2, random_state=42)

In [89]:
for column in df.select_dtypes(include='float64').columns:
    scale = StandardScaler().fit(X_train[[column]])
    X_train[[column]] = scale.transform(X_train[[column]])
    X_test[[column]] = scale.transform(X_test[[column]])

In [94]:
model = Sequential()
model.add(Dense(32, input_dim=31811, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [95]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [96]:
model.fit(X_train,y_train,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x248d702f390>

In [97]:
y_pred = model.predict(X_test)> 0.5

In [103]:
f1_score(y_test, y_pred, average='micro')

0.00047355958958168905

In [104]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      1.00      0.00         3
           2       0.00      0.00      0.00        25
           3       0.00      0.00      0.00        32
           4       0.00      0.00      0.00        35
           5       0.00      0.00      0.00       895
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         6
           9       0.00      0.00      0.00       427
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00      1690
          13       0.00      0.00      0.00      1032
          14       0.00      0.00      0.00      2168
          15       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         3
          19       0.00    

In [105]:
print(confusion_matrix(y_test, y_pred))

[[   0    1    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0    3    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0   25    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0   32    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0   35    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0  895    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0    5    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0    6    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0  427    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0    2    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0    2    0    0    0    0    0    0    0    

### NN lost to AdaBoost and GBoost but outperformed all other models

# 2. CNN - in file 2_CNN.ipynb

# 3. RNN

In [125]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalMaxPooling1D, SimpleRNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [168]:
data = pd.read_csv('Sheet.csv')
data = data.drop('response_id',axis=1)
data = data.replace('not_flagged', 0)
data = data.replace('flagged', 1)
data = data.replace(np.NaN, '2')
data = data.astype({'class': 'int64'})
#data.dtypes
data.head(10)

Unnamed: 0,class,response_text
0,0,I try and avoid this sort of conflict
1,1,Had a friend open up to me about his mental ad...
2,2,2
3,0,i cant think of one really...i think i may hav...
4,2,2
5,0,a couple of years ago my friends was going to ...
6,1,Roommate when he was going through death and l...
7,1,i've had a couple of friends (you could say mo...
8,0,Listened to someone talk about relationship tr...
9,2,2


In [169]:
text = data['response_text'].values
labels = data['class'].values
text_train, text_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state=42)
print(text_train.shape, text_test.shape)

(64,) (16,)


In [170]:
data.response_text.str.len().max()

386

In [171]:
vocab_size = 5000
maxlen = 135
embedding_size = 32

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(text_train)

X_train = tokenizer.texts_to_sequences(text_train)
x_test = tokenizer.texts_to_sequences(text_test)

X_train = pad_sequences(X_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

In [172]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=maxlen))
model.add(SimpleRNN(32, return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 135, 32)           160000    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 135, 32)           2080      
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 32)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_33 (Dense)             (None, 1)                 33        
Total params: 162,113
Trainable params: 162,113
Non-trainable params: 0
_________________________________________________________________


In [173]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, batch_size=512, validation_data=(x_test, y_test), epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [174]:
prediction = model.predict(x_test)>0.5
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 6.25%
