In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow_hub as hub
import tensorflow as tf

In [None]:
# Load only the wine-review columns listed below from the CSV.
df = pd.read_csv(
    'wine-reviews.csv',
    usecols=['country', 'description', 'points', 'price', 'variety', 'winery'],
)

In [None]:
# Peek at the first three reviews.
df.head(n=3)

In [None]:
# Drop reviews missing either the text or the score.
df = df.dropna(
    subset=['description', 'points'],
)

In [None]:
# Distribution of review scores across 20 bins.
fig, ax = plt.subplots()
ax.hist(df['points'], bins=20)
ax.set_ylabel('N')
ax.set_xlabel('Points')
plt.show()

In [None]:
# Load the diabetes table; later cells treat the last column ('Outcome') as the target.
df1 = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [None]:
# First five rows of the diabetes table.
df1.head(n=5)

In [None]:
# Pairwise Pearson correlation between all columns (including the outcome).
df1.corr(method='pearson')

##### Checking to see if both categories contain the same number of values

In [None]:
# (rows with Outcome == 1, rows with Outcome == 0)
int((df1['Outcome'] == 1).sum()), int((df1['Outcome'] == 0).sum())

##### Histograms of each feature, split by outcome. Normalized to densities because the two categories contain different numbers of samples (see above)

In [None]:
# One figure per feature column (every column except the last), overlaying the
# two outcome groups. density=True normalizes each histogram so the groups can
# be compared despite having different numbers of rows.
has_diabetes = df1['Outcome'] == 1
no_diabetes = df1['Outcome'] == 0

for feature in df1.columns[:-1]:
    plt.hist(df1.loc[has_diabetes, feature], bins=15, density=True,
             alpha=0.7, color='blue', label='Diabetes')
    plt.hist(df1.loc[no_diabetes, feature], bins=15, density=True,
             alpha=0.7, color='red', label='No diabetes')
    plt.title(feature)
    plt.ylabel("Probability")
    plt.xlabel(feature)
    plt.legend()
    plt.show()

##### Splitting dataset into features and output variables

In [None]:
# Every column but the last is a feature; the last column is the target.
x = df1.iloc[:, :-1].values
y = df1.iloc[:, -1].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler 

##### Scaling values because of difference in feature ranges

In [None]:
# Standardize every feature column.
# NOTE(review): the scaler is fit on the full dataset before the
# train/validation/test split, so held-out rows influence the scaling.
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Re-attach the (unscaled) target as the last column so the scaled data can be
# viewed as a frame with the original column names.
data = np.column_stack((x, y))
transformed_df = pd.DataFrame(data, columns=df1.columns)

In [None]:
# Balance the two classes by randomly duplicating minority-class rows.
# The sampler is seeded so a fresh "Restart & Run All" reproduces the same
# resampled dataset (the later splits already use random_state=0).
# NOTE(review): resampling happens before the train/validation/test split, so
# duplicated rows can end up in both the training and the evaluation sets.
over = RandomOverSampler(random_state=0)
x, y = over.fit_resample(x, y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

##### Splitting into training and testing dataset

In [None]:
# 60 % train; the remaining 40 % is halved into validation and test (20 % each).
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.4, random_state=0
)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=0
)

##### Creation of feedforward neural net

In [None]:
# Two hidden ReLU layers (relu(x) = max(0, x)) followed by a single sigmoid
# unit for the binary output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Adam optimizer with binary cross-entropy, matching the sigmoid output unit.
# Fixes: the optimizer lives under tf.keras.optimizers (tf.test has no
# `optimizers` module), and the loss must be an instance, not the class itself.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Baseline on the training split before any fitting has happened.
model.evaluate(x=x_train, y=y_train)

In [None]:
# Baseline on the validation split before any fitting has happened.
model.evaluate(x=x_valid, y=y_valid)

In [None]:
# Train for 20 epochs in mini-batches of 16, tracking validation metrics each epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=16,
    epochs=20,
    validation_data=(x_valid, y_valid),
)

In [None]:
# Final check on the held-out test split.
model.evaluate(x=x_test, y=y_test)

##### Text classification

In [None]:
# Back to the wine reviews: show the first two rows.
df.head(n=2)

##### Classifying based on whether the score is at least 90 points (label 1) or below 90 (label 0)

In [None]:
# Binary target: 1 when the review scored at least 90 points, else 0.
# Only the review text and the target are kept from here on.
df = (
    df
    .assign(label=(df['points'] >= 90).astype(int))
    [['description', 'label']]
)


##### Splitting dataset into training, validation, and testing datasets

In [None]:
# Shuffle once with a fixed seed so the split is reproducible, then cut the
# rows 80 % / 10 % / 10 % into train / validation / test.
# Plain positional slicing replaces np.split on a DataFrame, which goes
# through a deprecated pandas code path.
shuffled = df.sample(frac=1, random_state=0)
train_end = int(0.8 * len(shuffled))
val_end = int(0.9 * len(shuffled))

train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
    '''Convert a DataFrame into a batched, prefetching tf.data.Dataset.

    Args:
        dataframe: frame with a "description" column (model input) and a
            "label" column (target). It is copied, not modified.
        shuffle: when True, shuffle with a buffer as large as the frame.
        batch_size: number of examples per batch.

    Returns:
        A tf.data.Dataset yielding (description, label) batches.
    '''
    # Work on a copy so popping the label column leaves the caller's frame intact.
    frame = dataframe.copy()
    labels = frame.pop('label')
    texts = frame["description"]
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

In [None]:
# Wrap each split in a tf.data pipeline using the helper's defaults.
train_data, valid_data, test_data = (
    df_to_dataset(split) for split in (train, val, test)
)

##### Embedding and model

In [None]:
# TF-Hub text-embedding module, loaded as a trainable Keras layer that takes
# raw strings as input.
embedding = 'https://tfhub.dev/google/nnlm-en-dim50/2'
hub_layer = hub.KerasLayer(
    embedding,
    dtype=tf.string,
    trainable=True,
)

In [None]:
# Sanity check: embed the descriptions of a single batch.
# Only the first batch is pulled from the pipeline instead of materializing
# the entire dataset into a list just to index element 0.
hub_layer(next(iter(train_data))[0])

In [None]:
# Text classifier: hub embedding -> two ReLU layers -> sigmoid output.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Adam + binary cross-entropy, reporting accuracy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Baseline on the training data before fitting.
model.evaluate(x=train_data)

In [None]:
# Baseline on the validation data before fitting.
model.evaluate(x=valid_data)

In [None]:
# Ten epochs, keeping the per-epoch metrics in `history`.
history = model.fit(
    x=train_data,
    epochs=10,
    validation_data=valid_data,
)

As training progresses, the training loss keeps decreasing while the validation loss stops improving and starts to rise; this is an example of overfitting. The model fits the training data very well but isn't good at generalizing, so it needs to be modified to improve performance.

In [None]:
# Build a fresh embedding layer for the regularized model. The earlier
# `hub_layer` is trainable and was already updated while fitting the previous
# model; reusing it would start this model from fine-tuned weights and make
# the two runs incomparable.
hub_layer_dropout = hub.KerasLayer(embedding, dtype=tf.string, trainable=True)

model = tf.keras.Sequential()
model.add(hub_layer_dropout)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))  # added to counter overfitting
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Same optimizer, loss and metric as the previous text model.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Baselines on both splits, then a shorter training run
# (the number of epochs is reduced to 5).
model.evaluate(x=train_data)
model.evaluate(x=valid_data)
history = model.fit(x=train_data, epochs=5, validation_data=valid_data)

In [None]:
# Final score on the held-out test data.
model.evaluate(x=test_data)