In [None]:
!pip install pandas scikit-learn xgboost matplotlib



In [None]:
import pandas as pd
# Read data.csv (rows with a known 'species') and test.csv (rows predicted later on).
df, test_df = (pd.read_csv(csv_name) for csv_name in ('data.csv', 'test.csv'))

# Snapshot of the test frame taken before any derived columns are added to it.
test_df_copy = test_df.copy()

print(df.head())

                                    message  fingers tail  species
0                        pluvia arbor aquos        4   no   Aquari
1                 cosmix xeno nebuz odbitaz        5  yes  Zorblax
2        solarix glixx novum galaxum quasar        5  yes  Zorblax
3  arbor insectus pesros ekos dootix nimbus        2  yes  Florian
4         mermax drakos lorix epikoz deftax        4   no   Faerix


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the 'message' text, with the vocabulary capped at 120 terms.
# 120 was chosen by trying several max_features values and comparing training
# and test accuracy.
tfidf = TfidfVectorizer(max_features=120)

# Fit the vocabulary and IDF weights on data.csv, then apply the same fitted
# vectorizer to test.csv so both matrices share identical columns.
# NOTE(review): the vectorizer is fitted before the train/validation split done
# later, so held-out rows influence the vocabulary/IDF weights; the validation
# accuracy may be slightly optimistic.
X_text = tfidf.fit_transform(df['message'])
X_test_text = tfidf.transform(test_df['message'])

print(X_text.shape)


(500, 120)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode (this is NOT one-hot encoding) the binary 'tail' column into a
# single integer column: 'no' -> 0, 'yes' -> 1.
# The guard makes the cell idempotent: once 'tail' has been dropped from df,
# re-running the cell is a no-op instead of raising KeyError on df['tail'].
if 'tail' in df.columns:
    label_encoder = LabelEncoder()
    df['tail_encoded'] = label_encoder.fit_transform(df['tail'])
    # Reuse the encoder fitted on data.csv so both frames share one mapping;
    # transform() raises ValueError if test.csv contains a value not seen in data.csv.
    test_df['tail_encoded'] = label_encoder.transform(test_df['tail'])
    # The raw text column is no longer needed on the training frame.
    df = df.drop('tail', axis=1)

In [None]:
# Preview the training frame after encoding: 'tail' is replaced by 'tail_encoded'.
df.head()

Unnamed: 0,message,fingers,species,tail_encoded
0,pluvia arbor aquos,4,Aquari,0
1,cosmix xeno nebuz odbitaz,5,Zorblax,1
2,solarix glixx novum galaxum quasar,5,Zorblax,1
3,arbor insectus pesros ekos dootix nimbus,2,Florian,1
4,mermax drakos lorix epikoz deftax,4,Faerix,0


In [None]:
import numpy as np
from scipy.sparse import hstack

# Feature matrices: the TF-IDF columns followed by the two numeric columns.
# hstack places the sparse text block and the dense numeric block side by side,
# so every row carries all of its features in a single matrix.
numeric_cols = ['fingers', 'tail_encoded']
X = hstack([X_text, df[numeric_cols].values])
X_test_data = hstack([X_test_text, test_df[numeric_cols].values])

# Target column: the species name for each row of data.csv.
y = df['species']


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each species name to an integer class id so the target can be used to
# train XGBoost. The fitted encoder is kept to decode predictions later.
label_encoder_species = LabelEncoder()
y_encoded = label_encoder_species.fit_transform(y)

# Preview of the first five encoded targets.
print(y_encoded[:5])


[0 9 9 4 3]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")


Training set shape: (400, 122), Test set shape: (100, 122)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# First candidate: a 100-tree random forest with a fixed seed, fitted on the
# training split (fit() returns the estimator itself, so it can be chained).
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Accuracy on the training split vs. the held-out split.
print(f"Training accuracy: {rf.score(X_train, y_train):.2f}")
print(f"Testing accuracy: {rf.score(X_test, y_test):.2f}")


Training accuracy: 1.00
Testing accuracy: 0.85


In [None]:
#we also try to use xgboost classifier to perform the classification
import xgboost as xgb

# Second candidate: an XGBoost classifier with 100 boosting rounds and a fixed seed.
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Accuracy on the training split vs. the held-out split.
print(f"Training accuracy: {xgb_model.score(X_train, y_train):.2f}")
print(f"Testing accuracy: {xgb_model.score(X_test, y_test):.2f}")


Training accuracy: 1.00
Testing accuracy: 0.81


As it turns out, the random forest classifier achieves a better test accuracy than the XGBoost classifier. Hence we will use the random forest classifier to predict the species for the new data provided in the question.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out split with the trained random forest 'rf', then print
# the per-class classification report followed by the confusion matrix.
y_pred_rf = rf.predict(X_test)

for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred_rf))


              precision    recall  f1-score   support

           0       0.67      1.00      0.80         4
           1       0.86      1.00      0.92        12
           2       0.80      1.00      0.89         8
           3       0.79      0.79      0.79        14
           4       1.00      0.78      0.88         9
           5       0.70      0.70      0.70        10
           6       1.00      0.71      0.83         7
           7       0.92      0.86      0.89        14
           8       1.00      0.78      0.88         9
           9       0.86      0.92      0.89        13

    accuracy                           0.85       100
   macro avg       0.86      0.85      0.85       100
weighted avg       0.86      0.85      0.85       100

[[ 4  0  0  0  0  0  0  0  0  0]
 [ 0 12  0  0  0  0  0  0  0  0]
 [ 0  0  8  0  0  0  0  0  0  0]
 [ 0  0  0 11  0  3  0  0  0  0]
 [ 2  0  0  0  7  0  0  0  0  0]
 [ 0  0  0  3  0  7  0  0  0  0]
 [ 0  2  0  0  0  0  5  0  0  0]
 [ 0  0  0

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out split with the trained XGBoost model 'xgb_model', then
# print the per-class classification report followed by the confusion matrix.
y_pred_xgb = xgb_model.predict(X_test)

for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.57      1.00      0.73         4
           1       0.92      0.92      0.92        12
           2       0.80      1.00      0.89         8
           3       0.79      0.79      0.79        14
           4       1.00      0.67      0.80         9
           5       0.78      0.70      0.74        10
           6       0.75      0.86      0.80         7
           7       0.72      0.93      0.81        14
           8       1.00      0.78      0.88         9
           9       0.89      0.62      0.73        13

    accuracy                           0.81       100
   macro avg       0.82      0.82      0.81       100
weighted avg       0.83      0.81      0.81       100

[[ 4  0  0  0  0  0  0  0  0  0]
 [ 0 11  0  0  0  0  1  0  0  0]
 [ 0  0  8  0  0  0  0  0  0  0]
 [ 0  0  0 11  0  2  0  1  0  0]
 [ 3  0  0  0  6  0  0  0  0  0]
 [ 0  0  0  3  0  7  0  0  0  0]
 [ 0  1  0  0  0  0  6  0  0  0]
 [ 0  0  0

In [None]:
# As it turns out, the random forest classifier achieves a better test accuracy than the XGBoost classifier. Hence we will use the random forest classifier to predict the species for the new data provided in the question.

In [None]:
# Predict the encoded species id for every row of test.csv using the trained
# random forest classifier.
y_test_pred = rf.predict(X_test_data)
y_test_pred

array([0, 8, 6, 3, 6, 5, 5, 3, 2, 4, 5, 2, 9, 7, 5, 0, 4, 1, 0, 0, 9, 2,
       7, 6, 5, 6, 0, 8, 2, 5, 1, 2, 5, 7, 5, 2, 0, 4, 0, 7, 2, 7, 5, 6,
       0, 2, 9, 7, 9, 4, 2, 9, 8, 7, 2, 2, 8, 2, 5, 1, 1, 1, 5, 8, 8, 5,
       8, 3, 6, 7, 7, 2, 8, 9, 2, 8, 1, 3, 6, 2, 2, 1, 2, 0, 1, 8, 3, 9,
       6, 3, 8, 5, 7, 7, 8, 4, 5, 6, 2, 5, 1, 8, 8, 0, 6, 7, 0, 2, 9, 2,
       2, 5, 0, 5, 0, 3, 5, 1, 3, 7, 5, 8, 1, 0, 7, 4, 1, 4, 7, 9, 6, 7,
       3, 0, 8, 2, 1, 8, 6, 5, 1, 2, 7, 8, 7, 0, 2, 5, 0, 6, 2, 2, 0, 2,
       0, 8, 0, 7, 4, 2, 2, 0, 6, 1, 6, 2, 0, 6, 5, 1, 5, 8, 9, 2, 1, 8,
       4, 8, 4, 2, 2, 9, 0, 5, 3, 7, 2, 1, 6, 7, 8, 9, 5, 9, 1, 1, 5, 1,
       1, 6, 1, 3, 8, 0, 5, 1, 5, 0, 7, 3, 5, 3, 9, 1, 0, 2, 4, 3, 6, 6,
       0, 8, 6, 2, 0, 8, 2, 5, 0, 1, 3, 7, 0, 1, 1, 5, 1, 7, 3, 7, 7, 6,
       5, 8, 7, 4, 4, 2, 0, 0, 7, 6, 4, 9, 7, 7, 0, 3, 1, 9, 5, 3, 9, 3,
       9, 5, 6, 4, 7, 9, 1, 7, 2, 3, 3, 7, 7, 4, 1, 0, 8, 5, 8, 0, 2, 0,
       1, 0, 0, 1, 7, 2, 2, 4, 5, 6, 5, 7, 2])

In [None]:
# Convert the predicted integer ids back to species names, using the same
# encoder that produced y_encoded.
species_decoded = label_encoder_species.inverse_transform(y_test_pred)
species_decoded

array(['Aquari', 'Sentire', 'Nexoon', 'Faerix', 'Nexoon', 'Mythron',
       'Mythron', 'Faerix', 'Emotivor', 'Florian', 'Mythron', 'Emotivor',
       'Zorblax', 'Quixnar', 'Mythron', 'Aquari', 'Florian', 'Cybex',
       'Aquari', 'Aquari', 'Zorblax', 'Emotivor', 'Quixnar', 'Nexoon',
       'Mythron', 'Nexoon', 'Aquari', 'Sentire', 'Emotivor', 'Mythron',
       'Cybex', 'Emotivor', 'Mythron', 'Quixnar', 'Mythron', 'Emotivor',
       'Aquari', 'Florian', 'Aquari', 'Quixnar', 'Emotivor', 'Quixnar',
       'Mythron', 'Nexoon', 'Aquari', 'Emotivor', 'Zorblax', 'Quixnar',
       'Zorblax', 'Florian', 'Emotivor', 'Zorblax', 'Sentire', 'Quixnar',
       'Emotivor', 'Emotivor', 'Sentire', 'Emotivor', 'Mythron', 'Cybex',
       'Cybex', 'Cybex', 'Mythron', 'Sentire', 'Sentire', 'Mythron',
       'Sentire', 'Faerix', 'Nexoon', 'Quixnar', 'Quixnar', 'Emotivor',
       'Sentire', 'Zorblax', 'Emotivor', 'Sentire', 'Cybex', 'Faerix',
       'Nexoon', 'Emotivor', 'Emotivor', 'Cybex', 'Emotivor', 'Aqua

In [None]:
# Sanity check: confirm the decoded predictions are a numpy array before
# wrapping them in a DataFrame.
print(type(species_decoded))

<class 'numpy.ndarray'>


In [None]:
# Wrap the decoded species names in a one-column DataFrame and export it to
# prediction.csv. index=False keeps the row index out of the file, so the CSV
# contains only the 'species' column.
species_prediction_df = pd.DataFrame({'species': species_decoded})
species_prediction_df.to_csv('prediction.csv', index=False)