In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error


In [4]:
scores = pd.read_csv("../train_scores.csv")
train = pd.read_csv("../train_logs.csv")
train["activity2"] = train['activity'].apply(lambda row: 'Move' if 'Move' in row else row)

In [5]:
def final_word_count(df):
    return df.iloc[-1]['word_count']

def add_features(df):
    df["unsurity"] = ((df['text_change'].str.len() > 50) & (df['activity2']=="Remove/Cut")).apply(lambda x: int(x))
    df["structural_change"] = ((df['text_change'].str.len() > 50) & (df['activity2']=="Replace")).apply(lambda x: int(x))
    df["long_paste"] = ((df['text_change'].str.len() > 50) & (df['activity2']=="Paste")).apply(lambda x: int(x))
    df["unproductive_time"] = (train['activity2'] == "Nonproduction")*train['action_time']
    df["external_help"] = ((df['word_count'] < 10) & (df['activity2']=="Paste")).apply(lambda x: int(x))
    df["pasted_words_number"] = (train['activity2'] == "Paste")*train['text_change'].str.split().apply(len)
    df["large_changes"] = (df['text_change'].str.len() > 50).apply(lambda x: int(x))
    return df

def drop_unrelated_features(df, feat):
    df = df.drop(feat, axis = 1)
    return df

In [6]:
grouped_data = train.groupby(['id', "up_event"])
down = grouped_data["event_id"].count()
pivot = pd.pivot_table(data = down.reset_index(), index = "id", columns = "up_event", values = "event_id" ).fillna(0)
train = add_features(train)
train.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,activity2,unsurity,structural_change,long_paste,unproductive_time,external_help,pasted_words_number,large_changes
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,Nonproduction,0,0,0,31,0,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,Nonproduction,0,0,0,404,0,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,Nonproduction,0,0,0,0,0,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,Input,0,0,0,0,0,0,0
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,Input,0,0,0,0,0,0,0


In [7]:
grouped_data = train.groupby('id')
user = grouped_data.apply(lambda df: pd.Series({
    'Final Word Count': final_word_count(df)
}))

In [8]:
one_hot_encoded = pd.get_dummies(train['activity2'], prefix='activity').astype(int)

# Concatenate the one-hot encoded columns with the original DataFrame
train = pd.concat([train, one_hot_encoded], axis=1)
train = train.drop("activity2", axis = 1)
train = train.drop("up_event", axis = 1)


In [9]:
agg_columns = {
    'event_id':'count',
    'action_time': 'sum',
    'unsurity': 'sum',
    'structural_change': 'sum',
    'long_paste': 'sum',
    'unproductive_time': 'sum',
    'external_help': 'sum',
    'pasted_words_number': 'sum',
    'large_changes': 'sum',
    'activity_Input': 'sum',
    'activity_Move': 'sum',
    'activity_Nonproduction': 'sum',
    'activity_Paste': 'sum',
    'activity_Remove/Cut': 'sum',
    'activity_Replace': 'sum'
}
user_level = train.groupby('id').agg(agg_columns)
user_level.head()

Unnamed: 0_level_0,event_id,action_time,unsurity,structural_change,long_paste,unproductive_time,external_help,pasted_words_number,large_changes,activity_Input,activity_Move,activity_Nonproduction,activity_Paste,activity_Remove/Cut,activity_Replace
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
001519c8,2557,297243,0,0,0,18506,0,0,0,2010,3,120,0,417,7
0022f953,2454,275391,0,0,0,13781,0,0,0,1938,0,254,1,260,1
0042269b,4136,421201,1,4,0,33951,0,0,5,3515,0,175,0,439,7
0059420b,1556,189596,0,0,0,3062,0,1,0,1304,0,99,1,151,1
0075873a,2531,313702,0,0,0,6988,0,0,0,1942,0,72,0,517,0


In [10]:
merged_data = pd.merge(user_level, pivot, on='id', how='inner')
final_merge = pd.merge(merged_data, user, on='id', how='inner')
final_merge = final_merge.reset_index()
final_merge.head()
final_merge.to_csv("train_final.csv", index = None)

In [11]:
final_merge.describe()

Unnamed: 0,event_id,action_time,unsurity,structural_change,long_paste,unproductive_time,external_help,pasted_words_number,large_changes,activity_Input,...,,¡,¿,Â´,Ä±,Å,Ë,â,ä,Final Word Count
count,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,...,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0
mean,3401.820316,333667.5,0.352084,0.163497,0.102388,17909.078106,0.004856,5.253339,0.628895,2722.297046,...,0.000405,0.000405,0.000809,0.000405,0.004047,0.000405,0.000405,0.001619,0.000405,386.1121
std,1578.850387,157520.2,0.887996,0.536878,0.424501,32568.430834,0.075129,42.045249,1.458841,1196.384644,...,0.020117,0.020117,0.028444,0.020117,0.14222,0.020117,0.020117,0.080468,0.020117,171.773394
min,262.0,13452.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,230.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0
25%,2193.5,211148.0,0.0,0.0,0.0,3993.0,0.0,0.0,0.0,1786.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,250.0
50%,3082.0,304951.0,0.0,0.0,0.0,9308.0,0.0,0.0,0.0,2477.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,346.0
75%,4301.0,424814.0,0.0,0.0,0.0,19685.5,0.0,0.0,1.0,3397.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,477.0
max,12876.0,1210508.0,9.0,6.0,6.0,482115.0,2.0,1016.0,17.0,9091.0,...,1.0,1.0,1.0,1.0,5.0,1.0,1.0,4.0,1.0,1326.0


In [13]:
transformed = final_merge.copy()
col = final_merge.columns
transformed[col[1:]] = final_merge[col[1:]].apply(lambda x: x.apply(lambda y: np.log1p(y)))
transformed.to_csv("transformed.csv")
transformed.head()
scores = pd.read_csv("../train_scores.csv")
final_v1 = pd.merge(transformed, scores, on='id', how='inner')
final_v1.head()
final_v1.to_csv("Final_v1.csv")
X = transformed.drop(columns=["id"])
y = scores["score"]

# Split the data into training and testing sets (adjust the test_size parameter as needed)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False, header=True)  # Assuming 'y_train' has column names
y_test.to_csv('y_test.csv', index=False, header=True)  # Assuming 'y_test' has column names

In [14]:
svc_model = SVC(kernel='rbf', random_state=42)

# Fit the model on the training data
svc_model.fit(X_train, y_train.astype(str))

# Predict on the test data
y_pred = svc_model.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test.astype(str), y_pred)
print(f"Accuracy Score: {accuracy}")

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy Score: 0.28225806451612906


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [15]:
max_acc = -1
for i in range(1,130):
    for j in range(1,30):
        rf_model = RandomForestClassifier(n_estimators=i, max_depth=j, random_state=42)
        # Fit the model on the training data
        rf_model.fit(X_train, y_train.astype(str))

        # Predict on the test data
        y_pred = rf_model.predict(X_test)

        # Calculate the accuracy score
        accuracy = accuracy_score(y_test.astype(str), y_pred)
        if accuracy > max_acc:
            max_acc = accuracy
            n_estimators = i
            depth = j
        # print(f"The n_estimators are: {i} the max depth is {j} Accuracy Score for this is: {accuracy}")

print(f"Best numbers= n_estimators: {n_estimators} | max depth: {depth} | Accuracy Score: {max_acc}")

feature_importances = rf_model.feature_importances_

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
)

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance DataFrame
print("Feature Importance:")
print(feature_importance_df)

rf_model_best = RandomForestClassifier(n_estimators=87, max_depth=9, random_state=42)
# Fit the model on the training data
rf_model_best.fit(X_train, y_train.astype(str))

# Predict on the test data
y_pred = rf_model_best.predict(X_test)

# Calculate the accuracy score
# accuracy = accuracy_score(y_test.astype(str), y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred.astype(float)))
print(f"Root Mean Squared Error (RMSE): {rmse}")

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "

Best numbers= n_estimators: 41 | max depth: 12 | Accuracy Score: 0.38306451612903225
Feature Importance:
              Feature  Importance
145  Final Word Count    0.060889
9      activity_Input    0.052112
119                 q    0.051254
91              Space    0.047641
1         action_time    0.043282
..                ...         ...
133                     0.000004
88                  S    0.000000
69                 F3    0.000000
142                Ë    0.000000
143               â    0.000000

[146 rows x 2 columns]
Root Mean Squared Error (RMSE): 0.6977787337618728


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [16]:
regressor = LinearRegression()

# Fit the model on the training data
regressor.fit(X_train, y_train)

# Predict on the test data
y_pred = regressor.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")

NameError: name 'LinearRegression' is not defined

In [None]:
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [None]:
class MLPRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(MLPRegression, self).__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 64)
        self.fc4 = nn.Linear(64, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x


X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Define the model
input_size = X_train.shape[1]  # Update with your input size
output_size = 1  # For regression, output size is typically 1
model = MLPRegression(input_size, output_size)

# Define the loss function (Mean Squared Error for regression)
criterion = nn.MSELoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)  # You may need to adjust the learning rate

# Create a DataLoader for batch training
batch_size = 64  # You may need to adjust the batch size
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


val_dataset = TensorDataset(X_test_tensor, y_test_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

epoch_list = []
training_error = []
validation_error = []

num_epochs = 200  # You may need to adjust the number of epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    epoch_list.append(epoch)
    

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = torch.sqrt(criterion(outputs, targets))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    
    
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            val_loss += torch.sqrt(criterion(outputs, targets))

    average_val_loss = val_loss / len(val_loader)
    
    # print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {average_loss:.4f} | Validation Loss: {average_val_loss:.4f}')
    training_error.append(average_loss)
    validation_error.append(average_val_loss)
    
plt.plot(epoch_list, training_error, label='Training Loss',linestyle='-')
plt.plot(epoch_list, validation_error, label='Validation Loss',linestyle='-')

# Adding labels and title
plt.xlabel('Epoch')
plt.ylabel('Root Mean Squared Error')
plt.title('Training and Validation Loss Over Epochs')

# Adding legend
plt.legend()

print(min(validation_error))

# Display the plot
plt.show()
