In [48]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import scipy

In [49]:
data = pd.read_csv('data.csv')

In [50]:
df = pd.DataFrame(data, columns=['message', 'fingers', 'tail', 'species'])

In [51]:
# Features and target variable
X = df[['message','fingers','tail']]
y = df['species']
# Convert categorical variable 'flag' to numeric
X['tail'] = X['tail'].map({'yes': 1, 'no': 0})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['tail'] = X['tail'].map({'yes': 1, 'no': 0})


In [52]:
# Vectorization
vectorizer = TfidfVectorizer()
X_message_vec = vectorizer.fit_transform(X['message'])

# Combine the vectorized message with other features
X_final = scipy.sparse.hstack((X_message_vec, X[['fingers', 'tail']].values))

# Split the data into training and test sets (optional)
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)
y_pred

# Train Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

# Regularization with Logistic Regression
# Using L2 regularization (default)
log_reg_l2 = LogisticRegression(penalty='l2', solver='liblinear', random_state=42)
log_reg_l2.fit(X_train.toarray(), y_train)
y_pred_l2 = log_reg_l2.predict(X_test.toarray())
print("L2 Regularized Logistic Regression Report:")
print(classification_report(y_test, y_pred_l2))

# Using L1 regularization
log_reg_l1 = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
log_reg_l1.fit(X_train.toarray(), y_train)
y_pred_l1 = log_reg_l1.predict(X_test.toarray())
print("L1 Regularized Logistic Regression Report:")
print(classification_report(y_test, y_pred_l1))

# Load and preprocess new test data
test_data = pd.read_csv('test.csv')
test_data['tail'] = test_data['tail'].map({'yes': 1, 'no': 0})
new_test_vec = vectorizer.transform(test_data['message'])
new_test_final = scipy.sparse.hstack((new_test_vec, test_data[['fingers', 'tail']].values))

# Predict species using Random Forest
predicted_species = rf.predict(new_test_final)
print(f"Predicted Species for test data: {predicted_species}")

test_data = pd.read_csv('test.csv')
# Preprocess the new test data
new_test_df = pd.DataFrame(test_data)
new_test_df['tail'] = new_test_df['tail'].map({'yes': 1, 'no': 0})
new_test_vec = vectorizer.transform(new_test_df['message'])
new_test_final = scipy.sparse.hstack((new_test_vec, new_test_df[['fingers', 'tail']].values))

# Predict species
predicted_species = rf.predict(new_test_final)
print(f"Predicted Species: {predicted_species[0]}")

              precision    recall  f1-score   support

      Aquari       0.67      1.00      0.80         4
       Cybex       0.85      0.92      0.88        12
    Emotivor       0.80      1.00      0.89         8
      Faerix       0.91      0.71      0.80        14
     Florian       1.00      0.78      0.88         9
     Mythron       0.69      0.90      0.78        10
      Nexoon       0.83      0.71      0.77         7
     Quixnar       0.79      0.79      0.79        14
     Sentire       1.00      0.78      0.88         9
     Zorblax       0.77      0.77      0.77        13

    accuracy                           0.82       100
   macro avg       0.83      0.84      0.82       100
weighted avg       0.84      0.82      0.82       100

L2 Regularized Logistic Regression Report:
              precision    recall  f1-score   support

      Aquari       0.67      1.00      0.80         4
       Cybex       0.92      0.92      0.92        12
    Emotivor       0.80      1.00  