In [1]:
import numpy as np
import re
import nltk
import pickle
from nltk.corpus import stopwords
import pandas as pd

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv('speeches_spx_combined.csv')

In [4]:
df = df.dropna()

In [5]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove all the special characters
    text = re.sub(r'\W', ' ', str(text))
    
    # remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text) 
    
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    
    # Converting to Lowercase
    text = text.lower()
    
    # Lemmatization
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    
    return text

In [6]:
df['text'] = df['text'].apply(preprocess_text)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10000, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(df['text']).toarray()

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

In [19]:
y = df['change']

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import  MLPClassifier
from sklearn.svm import SVC

In [22]:
models = {
    "Logistic": LogisticRegression(),
    "   KNN  ": KNeighborsClassifier(),
    "   RF   ": RandomForestClassifier(),
    "   SVC  ": SVC(),
}

print("-------- Training --------")
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained!")
print("---------- Done ----------")

-------- Training --------
Logistic trained!
   KNN   trained!
   RF    trained!
   SVC   trained!
---------- Done ----------


In [15]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

In [23]:
results = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(name + "    Accuracy: {:.2f} %".format(accuracy_score(y_test, y_pred) * 100))
    print("            F1 Score: {:.2f} %".format(f1_score(y_test, y_pred) * 100))
    print("              Recall: {:.2f} %".format(recall_score(y_test, y_pred) * 100))
    print("           Precision: {:.2f} %".format(precision_score(y_test, y_pred) * 100))
    print("-----------------------------")
    results.append(confusion_matrix(y_test, y_pred))

Logistic    Accuracy: 52.30 %
            F1 Score: 60.49 %
              Recall: 66.47 %
           Precision: 55.50 %
-----------------------------


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


   KNN      Accuracy: 51.97 %
            F1 Score: 53.80 %
              Recall: 50.90 %
           Precision: 57.05 %
-----------------------------
   RF       Accuracy: 50.66 %
            F1 Score: 57.39 %
              Recall: 60.48 %
           Precision: 54.59 %
-----------------------------
   SVC      Accuracy: 51.32 %
            F1 Score: 61.05 %
              Recall: 69.46 %
           Precision: 54.46 %
-----------------------------


In [24]:
from keras.models import Sequential
from keras import layers



NameError: name 'vocab_size' is not defined

In [26]:
input_dim = X_train.shape[1]  # Number of features

model = Sequential()
model.add(layers.Embedding(input_dim=10000, 
                           output_dim=1000,
                           input_length=10000))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 10000, 1000)       10000000  
                                                                 
 flatten_1 (Flatten)         (None, 10000000)          0         
                                                                 
 dense_1 (Dense)             (None, 10)                100000010 
                                                                 
 dense_2 (Dense)             (None, 1)                 11        
                                                                 
Total params: 110,000,021
Trainable params: 110,000,021
Non-trainable params: 0
_________________________________________________________________


In [27]:
history = model.fit(X_train, y_train, epochs=10, verbose=False, validation_data=(X_test, y_test), batch_size=10)

KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()