In [14]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, svm
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.utils import shuffle

In [15]:
def replace_emotion_pos(folder_path):
    # List all CSV files in the specified folder
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    # Initialize an empty DataFrame to store the merged data
    merged_df = pd.DataFrame()

    # Iterate through each CSV file
    for file in csv_files:
        # Read the CSV file into a DataFrame
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)

        # Replace 'emotion' with 'positive' in the 'state' column
        df.loc[df['state'] == 'emotion', 'state'] = 'positive'

        # Concatenate the current DataFrame with the merged DataFrame
        merged_df = pd.concat([merged_df, df], ignore_index=True)

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv('merged_positive_data.csv', index=False)


def replace_emotion_neg(folder_path):
    # List all CSV files in the specified folder
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    # Initialize an empty DataFrame to store the merged data
    merged_df = pd.DataFrame()

    # Iterate through each CSV file
    for file in csv_files:
        # Read the CSV file into a DataFrame
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)

        # Replace 'emotion' with 'negative' in the 'state' column
        df.loc[df['state'] == 'emotion', 'state'] = 'negative'

        # Concatenate the current DataFrame with the merged DataFrame
        merged_df = pd.concat([merged_df, df], ignore_index=True)

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv('merged_negative_data.csv', index=False)

if __name__ == "__main__":
    folder_path_pos = '/Users/alexisbader/Desktop/446 Data ALL DATA/Positive'  # Specify the folder containing the CSV files
    folder_path_neg = '/Users/alexisbader/Desktop/446 Data ALL DATA/Negative'  # Specify the folder containing the CSV files
    replace_emotion_pos(folder_path_pos)
    replace_emotion_neg(folder_path_neg)


In [16]:
def process_csv(positive_file, negative_file):
    # Read the positive and negative CSV files into DataFrames
    df_positive = pd.read_csv(positive_file)
    df_negative = pd.read_csv(negative_file)

    # Create new columns for positive, negative, and neutral in the positive DataFrame
    df_positive['positive'] = df_positive['state'].apply(lambda x: 1 if x == 'emotion' else 0)
    df_positive['negative'] = df_positive['state'].apply(lambda x: 1 if x == 'negative' else 0)
    df_positive['neutral'] = df_positive['state'].apply(lambda x: 1 if x == 'neutral' else 0)

    # Create new columns for positive, negative, and neutral in the negative DataFrame
    df_negative['positive'] = df_negative['state'].apply(lambda x: 1 if x == 'positive' else 0)
    df_negative['negative'] = df_negative['state'].apply(lambda x: 1 if x == 'emotion' else 0)
    df_negative['neutral'] = df_negative['state'].apply(lambda x: 1 if x == 'neutral' else 0)

    # Concatenate positive and negative DataFrames into a single DataFrame
    merged_df = pd.concat([df_positive, df_negative], ignore_index=True)

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv('merged_data.csv', index=False)

if __name__ == "__main__":
    positive_file_path = 'merged_positive_data.csv'  # Specify the path to the positive CSV file
    negative_file_path = 'merged_negative_data.csv'  # Specify the path to the negative CSV file

    # Process the positive and negative CSV files and merge the results
    process_csv(positive_file_path, negative_file_path)


In [17]:
df = pd.read_csv('merged_data.csv')
df

Unnamed: 0,state,label,alpha_0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,alpha_6,alpha_7,...,theta_1,theta_2,theta_3,theta_4,theta_5,theta_6,theta_7,positive,negative,neutral
0,neutral,powerByBand,27.878453,1.343236,29.649571,1.761871,13.700364,10.776772,1.710041,1.189413,...,5.460727,32.397776,1.063065,9.578764,20.229411,1.987501,1.819450,0,0,1
1,neutral,powerByBand,19.608959,5.130073,10.880214,2.737353,6.082946,5.972629,1.377526,2.347852,...,5.476059,27.827290,2.438290,14.383222,16.085973,1.406591,4.373358,0,0,1
2,neutral,powerByBand,28.563251,3.111770,28.133244,2.376958,1.841571,12.559124,0.737586,0.951554,...,8.029257,10.394931,2.070959,11.686628,17.597376,0.929804,1.982738,0,0,1
3,neutral,powerByBand,24.398742,1.894088,28.342327,3.619573,11.492029,12.323499,3.463120,3.527180,...,2.421294,27.008356,3.391060,8.643775,19.004300,1.397470,1.774354,0,0,1
4,neutral,powerByBand,21.800419,2.596292,24.948034,3.168503,5.515510,8.481524,3.467250,4.279955,...,2.515014,76.791480,2.591636,15.408173,50.523119,4.200850,6.334520,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11167,negative,powerByBand,2.044767,34.310579,30.704551,3.706996,2.753384,24.249485,37.708444,2.407485,...,98.472183,92.603604,2.556504,3.098782,80.326998,100.382799,2.140483,0,0,0
11168,negative,powerByBand,1.526025,21.850688,19.772270,2.453324,3.386010,18.782360,32.172493,1.069343,...,50.483530,58.070244,2.405282,2.410696,47.920766,66.469629,0.829919,0,0,0
11169,negative,powerByBand,1.417388,7.105501,9.456071,2.428563,3.309378,9.839710,11.388735,2.298787,...,20.615866,24.200763,2.818454,0.841357,29.500371,22.627760,0.793261,0,0,0
11170,negative,powerByBand,3.133698,14.161743,15.535556,6.444870,6.199818,7.570922,16.166410,2.359668,...,28.334749,27.783432,2.668504,0.380132,13.538567,27.790562,1.797018,0,0,0


In [18]:
mask = df['state'] != 'neutral'
df = df.drop(df.index[~mask])
df

Unnamed: 0,state,label,alpha_0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,alpha_6,alpha_7,...,theta_1,theta_2,theta_3,theta_4,theta_5,theta_6,theta_7,positive,negative,neutral
48,positive,powerByBand,25.230890,1.709768,61.406249,3.488279,4.922479,40.016023,1.544270,1.754783,...,4.639598,317.843328,2.147779,5.722057,241.845136,3.939457,3.383076,0,0,0
49,positive,powerByBand,12.414113,2.128211,36.228192,3.406207,4.157073,23.936543,1.786259,1.598960,...,1.485573,89.292922,3.004811,5.653609,76.239415,2.868465,2.227082,0,0,0
50,positive,powerByBand,21.607961,2.312697,66.845332,2.339994,5.833460,42.338080,2.526532,2.897129,...,1.211695,102.435089,3.034522,9.730180,60.773308,1.245747,1.448066,0,0,0
51,positive,powerByBand,36.986980,1.284288,103.208375,0.716365,11.276730,69.497178,1.373770,1.579550,...,3.879824,146.256342,1.229150,4.031118,96.705705,4.620602,3.659563,0,0,0
52,positive,powerByBand,30.647252,1.411346,39.184977,2.288844,8.813337,19.008598,0.735756,1.823774,...,0.800820,100.481930,1.912953,5.183982,68.038003,2.234770,3.114505,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11167,negative,powerByBand,2.044767,34.310579,30.704551,3.706996,2.753384,24.249485,37.708444,2.407485,...,98.472183,92.603604,2.556504,3.098782,80.326998,100.382799,2.140483,0,0,0
11168,negative,powerByBand,1.526025,21.850688,19.772270,2.453324,3.386010,18.782360,32.172493,1.069343,...,50.483530,58.070244,2.405282,2.410696,47.920766,66.469629,0.829919,0,0,0
11169,negative,powerByBand,1.417388,7.105501,9.456071,2.428563,3.309378,9.839710,11.388735,2.298787,...,20.615866,24.200763,2.818454,0.841357,29.500371,22.627760,0.793261,0,0,0
11170,negative,powerByBand,3.133698,14.161743,15.535556,6.444870,6.199818,7.570922,16.166410,2.359668,...,28.334749,27.783432,2.668504,0.380132,13.538567,27.790562,1.797018,0,0,0


In [19]:
state_mapping = {'negative': 0, 'positive': 1}
df['encoded_state'] = df['state'].map(state_mapping)
df


Unnamed: 0,state,label,alpha_0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,alpha_6,alpha_7,...,theta_2,theta_3,theta_4,theta_5,theta_6,theta_7,positive,negative,neutral,encoded_state
48,positive,powerByBand,25.230890,1.709768,61.406249,3.488279,4.922479,40.016023,1.544270,1.754783,...,317.843328,2.147779,5.722057,241.845136,3.939457,3.383076,0,0,0,1
49,positive,powerByBand,12.414113,2.128211,36.228192,3.406207,4.157073,23.936543,1.786259,1.598960,...,89.292922,3.004811,5.653609,76.239415,2.868465,2.227082,0,0,0,1
50,positive,powerByBand,21.607961,2.312697,66.845332,2.339994,5.833460,42.338080,2.526532,2.897129,...,102.435089,3.034522,9.730180,60.773308,1.245747,1.448066,0,0,0,1
51,positive,powerByBand,36.986980,1.284288,103.208375,0.716365,11.276730,69.497178,1.373770,1.579550,...,146.256342,1.229150,4.031118,96.705705,4.620602,3.659563,0,0,0,1
52,positive,powerByBand,30.647252,1.411346,39.184977,2.288844,8.813337,19.008598,0.735756,1.823774,...,100.481930,1.912953,5.183982,68.038003,2.234770,3.114505,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11167,negative,powerByBand,2.044767,34.310579,30.704551,3.706996,2.753384,24.249485,37.708444,2.407485,...,92.603604,2.556504,3.098782,80.326998,100.382799,2.140483,0,0,0,0
11168,negative,powerByBand,1.526025,21.850688,19.772270,2.453324,3.386010,18.782360,32.172493,1.069343,...,58.070244,2.405282,2.410696,47.920766,66.469629,0.829919,0,0,0,0
11169,negative,powerByBand,1.417388,7.105501,9.456071,2.428563,3.309378,9.839710,11.388735,2.298787,...,24.200763,2.818454,0.841357,29.500371,22.627760,0.793261,0,0,0,0
11170,negative,powerByBand,3.133698,14.161743,15.535556,6.444870,6.199818,7.570922,16.166410,2.359668,...,27.783432,2.668504,0.380132,13.538567,27.790562,1.797018,0,0,0,0


In [20]:
df = df.drop(['positive', 'negative', 'neutral'], axis=1)
df

Unnamed: 0,state,label,alpha_0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,alpha_6,alpha_7,...,gamma_7,theta_0,theta_1,theta_2,theta_3,theta_4,theta_5,theta_6,theta_7,encoded_state
48,positive,powerByBand,25.230890,1.709768,61.406249,3.488279,4.922479,40.016023,1.544270,1.754783,...,0.219624,106.296299,4.639598,317.843328,2.147779,5.722057,241.845136,3.939457,3.383076,1
49,positive,powerByBand,12.414113,2.128211,36.228192,3.406207,4.157073,23.936543,1.786259,1.598960,...,0.242088,35.038603,1.485573,89.292922,3.004811,5.653609,76.239415,2.868465,2.227082,1
50,positive,powerByBand,21.607961,2.312697,66.845332,2.339994,5.833460,42.338080,2.526532,2.897129,...,0.211766,37.063912,1.211695,102.435089,3.034522,9.730180,60.773308,1.245747,1.448066,1
51,positive,powerByBand,36.986980,1.284288,103.208375,0.716365,11.276730,69.497178,1.373770,1.579550,...,0.247310,45.607174,3.879824,146.256342,1.229150,4.031118,96.705705,4.620602,3.659563,1
52,positive,powerByBand,30.647252,1.411346,39.184977,2.288844,8.813337,19.008598,0.735756,1.823774,...,0.116252,64.111743,0.800820,100.481930,1.912953,5.183982,68.038003,2.234770,3.114505,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11167,negative,powerByBand,2.044767,34.310579,30.704551,3.706996,2.753384,24.249485,37.708444,2.407485,...,0.138963,1.364970,98.472183,92.603604,2.556504,3.098782,80.326998,100.382799,2.140483,0
11168,negative,powerByBand,1.526025,21.850688,19.772270,2.453324,3.386010,18.782360,32.172493,1.069343,...,0.140873,2.918165,50.483530,58.070244,2.405282,2.410696,47.920766,66.469629,0.829919,0
11169,negative,powerByBand,1.417388,7.105501,9.456071,2.428563,3.309378,9.839710,11.388735,2.298787,...,0.230960,1.541376,20.615866,24.200763,2.818454,0.841357,29.500371,22.627760,0.793261,0
11170,negative,powerByBand,3.133698,14.161743,15.535556,6.444870,6.199818,7.570922,16.166410,2.359668,...,0.142293,2.232883,28.334749,27.783432,2.668504,0.380132,13.538567,27.790562,1.797018,0


In [21]:
df = shuffle(df, random_state=42)

In [22]:
data = df.drop(['state', 'label', 'encoded_state'], axis=1)
target = df['encoded_state']

In [24]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,random_state=109) # 70% training and 30% test

In [25]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear')

#Train the model using the training sets
clf.fit(X_train, y_train)

In [26]:
# Perform 5-fold cross-validation
scores = cross_val_score(clf, X_train, y_train, cv=5)

# Print the accuracy for each fold
print("Cross-validation scores:", scores)

# Print the mean accuracy across all folds
print("Mean accuracy:", scores.mean())

Cross-validation scores: [0.74241355 0.74170783 0.75935074 0.72759351 0.74576271]
Mean accuracy: 0.7433656686960994


In [28]:
#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [29]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7334839073969509


In [30]:
np.unique(y_pred)

xx = np.where(y_pred == y_test)[0]
corr = y_test.iloc[xx]
wrong = y_test.iloc[~xx]
vals_right, counts_right = np.unique(corr, return_counts=True)
vals_wrong, counts_wrong = np.unique(wrong, return_counts=True)
vals_real, counts_real = np.unique(y_test, return_counts=True)

for i, val in enumerate(vals_right):
    print(f"accuracy val: {val} count: {counts_right[i]}/{counts_real[i]} = {counts_right[i]/counts_real[i] * 100}%")
for i, val in enumerate(vals_wrong):
    print(f"val wrong: {val} count: {counts_wrong[i]}")



accuracy val: 0 count: 588/902 = 65.18847006651885%
accuracy val: 1 count: 711/869 = 81.81818181818183%
val wrong: 0 count: 670
val wrong: 1 count: 629


In [31]:
from sklearn.metrics import confusion_matrix

# Assuming y_pred and y_test are already defined

conf_matrix = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[588 314]
 [158 711]]
