In [104]:
# fpr arithematics on arrays
import numpy as np

# for dataframe manipulation
import pandas as pd

# for visualization
import matplotlib as plt

# for splitting the dataset into train and testing
from sklearn.model_selection import train_test_split

# to change the target value to numerics
from sklearn.preprocessing import LabelEncoder

# making standardized X
from sklearn.preprocessing import MinMaxScaler

# accuracy metrics
from sklearn.metrics import accuracy_score, f1_score

# different models for classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

### Loading the dataset

In [105]:
df = pd.read_csv("lab-2-farms.csv")

In [106]:
df.head(2)

Unnamed: 0,Series_reference,Period,Data_value,STATUS,UNITS,MAGNTUDE,Subject,Group,Series_title_1,Series_title_2,Series_title_3,Series_title_4,Series_title_5
0,SNEA.SG01NAC16P10A11,2007.03,526,FINAL,Dollar,6,National Accounts - SNA 2008 - SNE,"Series, GDP(P), Nominal, Actual, Agricultural ...",Output,Wool,,,
1,SNEA.SG01NAC16P10A11,2008.03,482,FINAL,Dollar,6,National Accounts - SNA 2008 - SNE,"Series, GDP(P), Nominal, Actual, Agricultural ...",Output,Wool,,,


### Overview of data

In [107]:
df.describe()

Unnamed: 0,Period,Data_value,MAGNTUDE,Series_title_3,Series_title_4,Series_title_5
count,420.0,420.0,420.0,0.0,0.0,0.0
mean,2013.53,2376.02619,6.0,,,
std,4.035936,4840.409542,0.0,,,
min,2007.03,-244.0,6.0,,,
25%,2010.03,221.0,6.0,,,
50%,2013.53,715.5,6.0,,,
75%,2017.03,2108.25,6.0,,,
max,2020.03,30370.0,6.0,,,


In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Series_reference  420 non-null    object 
 1   Period            420 non-null    float64
 2   Data_value        420 non-null    int64  
 3   STATUS            420 non-null    object 
 4   UNITS             420 non-null    object 
 5   MAGNTUDE          420 non-null    int64  
 6   Subject           420 non-null    object 
 7   Group             420 non-null    object 
 8   Series_title_1    420 non-null    object 
 9   Series_title_2    420 non-null    object 
 10  Series_title_3    0 non-null      float64
 11  Series_title_4    0 non-null      float64
 12  Series_title_5    0 non-null      float64
dtypes: float64(4), int64(2), object(7)
memory usage: 42.8+ KB


In [109]:
df.dropna(how = "any", axis = 1, inplace = True)
df.dropna(how = "any", axis = 0, inplace = True)
df.drop_duplicates(inplace = True)

In [110]:
X = df.iloc[:,1:3]
y = df["Series_title_2"]
print("unique counts: ", y.nunique(),"\n",y.value_counts())


unique counts:  29 
 Series_title_2
Total                                                 28
Wool                                                  14
Value of Change in Livestock - Pigs, Deer and Goat    14
Other input N.E.C.                                    14
Freight                                               14
Repairs and Maintenance                               14
Fuel and Power                                        14
Fertiliser, Lime and Seeds                            14
Weed and Pest Control                                 14
Animal Health and Breeding                            14
Feed and Grazing                                      14
Purchase of Livestock                                 14
Non-Farm Income                                       14
Sale of Live Animals                                  14
Value of Change in Stocks - Timber                    14
Sheep                                                 14
Value of Change in Livestock - Cattle               

In [111]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()
# Fit and transform the categorical data
y = label_encoder.fit_transform(y)
print("Encoded data:", y)


Encoded data: [28 28 28 28 28 28 28 28 28 28 28 28 28 28 20 20 20 20 20 20 20 20 20 20
 20 20 20 20  3  3  3  3  3  3  3  3  3  3  3  3  3  3 15 15 15 15 15 15
 15 15 15 15 15 15 15 15  5  5  5  5  5  5  5  5  5  5  5  5  5  5 16 16
 16 16 16 16 16 16 16 16 16 16 16 16  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  9  9  9  9  9  9  9  9  9  9  9  9  9  9 26 26 26 26 26 26 26 26
 26 26 26 26 26 26 12 12 12 12 12 12 12 12 12 12 12 12 12 12  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 13 13 13 13 13 13 13 13 13 13 13 13 13 13
 24 24 24 24 24 24 24 24 24 24 24 24 24 24 22 22 22 22 22 22 22 22 22 22
 22 22 22 22 25 25 25 25 25 25 25 25 25 25 25 25 25 25 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 19 19 19 19 19 19 19 19 19 19 19 19 19 19 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 21 21 21 21 21 21 21 21 21 21 21 21
 21 21 17 17 17 17 17 17 17 17 17 17 17 17 17 17  6  6  6  6  6  6  6  6
  6  6  6  6  6  6  1  1  1  1  1  1  1  1  1  1  1  1  1  1 27 27 27 27
 27 27 27 27 27 27 27 27 27 27  7  7 

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

In [113]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(),
}


In [114]:
for name, model in models.items():
    model.fit(X_train, y_train)

# Step 3: Test all the models
results = {}
for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[name] = {'Accuracy': accuracy, 'F1 Score': f1}

# Step 4: Compare results
print("Model Comparison:")
for name, metrics in results.items():
    print(f"{name}: Accuracy = {metrics['Accuracy']:.4f}, F1 Score = {metrics['F1 Score']:.4f}")

# Step 5: Decide the best model
best_model = max(results, key=lambda x: results[x]['Accuracy'])
print(f"\nThe best model is {best_model} with an accuracy of {results[best_model]['Accuracy']:.4f}.")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Comparison:
Logistic Regression: Accuracy = 0.3333, F1 Score = 0.2783
Decision Tree: Accuracy = 0.3571, F1 Score = 0.3509
Random Forest: Accuracy = 0.0595, F1 Score = 0.0698
SVM: Accuracy = 0.0595, F1 Score = 0.0711
KNN: Accuracy = 0.2500, F1 Score = 0.2405
XGBoost: Accuracy = 0.2976, F1 Score = 0.2828

The best model is Decision Tree with an accuracy of 0.3571.


In [115]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Step 3: Test all the models
results = {}
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}

# Step 4: Compare results
print("Model Comparison:")
for name, metrics in results.items():
    print(f"{name}: Accuracy = {metrics['Accuracy']:.4f}, Precision = {metrics['Precision']:.4f}, Recall = {metrics['Recall']:.4f}, F1 Score = {metrics['F1 Score']:.4f}")

# Step 5: Decide the best model
best_model = max(results, key=lambda x: results[x]['Accuracy'])
print(f"\nThe best model is {best_model} with an accuracy of {results[best_model]['Accuracy']:.4f}.")


Model Comparison:
Logistic Regression: Accuracy = 0.3333, Precision = 0.2960, Recall = 0.3333, F1 Score = 0.2783
Decision Tree: Accuracy = 0.3571, Precision = 0.4044, Recall = 0.3571, F1 Score = 0.3509
Random Forest: Accuracy = 0.0595, Precision = 0.0845, Recall = 0.0595, F1 Score = 0.0698
SVM: Accuracy = 0.0595, Precision = 0.1060, Recall = 0.0595, F1 Score = 0.0711
KNN: Accuracy = 0.2500, Precision = 0.3538, Recall = 0.2500, F1 Score = 0.2405
XGBoost: Accuracy = 0.2976, Precision = 0.4750, Recall = 0.2976, F1 Score = 0.2828

The best model is Decision Tree with an accuracy of 0.3571.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
