In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

# Load CSV master file
csv_file = 'P:\\Desktop\\Thesis data\\master.csv'

# The first row in the CSV file contains the headers
df = pd.read_csv(csv_file)

# Replace NaN values in the 'Text files' column with an empty string
df['Text files'].fillna('', inplace=True)

X = df['Text files']

y = np.asarray(df[df.columns[1:29]])

print(len(y))

# Transform the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=30, max_df=0.9)
X_tfidf = vectorizer.fit_transform(X)

print(X_tfidf)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=101)

# Build a multi-output classifier with Logistic Regression as the base estimator
clf = MultiOutputClassifier(LogisticRegression()).fit(X_train, y_train)

# Now you can use 'clf' to make predictions on your multi-label classification task
# For example:
y_pred = clf.predict(X_test)

from sklearn.metrics import accuracy_score
print('Accuracy Score: ', accuracy_score(y_test, y_pred))


18132
  (0, 6)	1.0
  (4, 13)	1.0
  (5, 28)	0.36552936143777986
  (5, 18)	0.47877854561164396
  (5, 17)	0.7982226444976914
  (6, 28)	0.36552936143777986
  (6, 18)	0.47877854561164396
  (6, 17)	0.7982226444976914
  (7, 28)	0.36552936143777986
  (7, 18)	0.47877854561164396
  (7, 17)	0.7982226444976914
  (8, 28)	0.36552936143777986
  (8, 18)	0.47877854561164396
  (8, 17)	0.7982226444976914
  (10, 9)	0.3600805265299154
  (10, 28)	0.3410101001080331
  (10, 18)	0.4466626678809128
  (10, 17)	0.744678848336485
  (12, 28)	0.36552936143777986
  (12, 18)	0.47877854561164396
  (12, 17)	0.7982226444976914
  (17, 28)	0.36552936143777986
  (17, 18)	0.47877854561164396
  (17, 17)	0.7982226444976914
  (18, 28)	0.36552936143777986
  :	:
  (18105, 0)	0.6234430606453807
  (18105, 8)	0.168866093370632
  (18105, 10)	0.5491409090267323
  (18105, 4)	0.31225827798662675
  (18105, 18)	0.12262878219152262
  (18110, 8)	0.5162425373833328
  (18110, 10)	0.279797531284881
  (18110, 4)	0.4773042430517945
  (18110, 3)	

In [3]:
# calculate metrics
from sklearn.metrics import classification_report
periods= ['early medieval', 'late mesolithic', 'medieval', 'post medieval',
    'later prehistoric', 'early iron age', 'middle palaeolithic', 'neolithic',
    'late iron age', 'bronze age', 'early bronze age', 'late prehistoric', 'roman',
    'middle iron age', 'late neolithic', 'early neolithic', 'middle bronze age',
    'early mesolithic', 'lower palaeolithic', 'upper palaeolithic', 'late bronze age',
    'palaeolithic', 'early prehistoric', 'mesolithic', '20th century',
    'middle neolithic', 'iron age', 'nil antiquity']
print(classification_report(y_test, y_pred, target_names=periods))


                     precision    recall  f1-score   support

     early medieval       0.00      0.00      0.00       145
    late mesolithic       0.00      0.00      0.00         4
           medieval       0.00      0.00      0.00       962
      post medieval       0.59      0.95      0.73      2118
  later prehistoric       0.00      0.00      0.00       122
     early iron age       0.00      0.00      0.00        18
middle palaeolithic       0.00      0.00      0.00         1
          neolithic       0.00      0.00      0.00        58
      late iron age       0.00      0.00      0.00        59
         bronze age       0.00      0.00      0.00       105
   early bronze age       0.00      0.00      0.00        34
   late prehistoric       0.00      0.00      0.00        53
              roman       0.00      0.00      0.00       527
    middle iron age       0.00      0.00      0.00        36
     late neolithic       0.00      0.00      0.00        29
    early neolithic    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
# Classifiers
print(len(y_train))
from sklearn.svm import SVC
clf = MultiOutputClassifier(SVC()).fit(X_train, y_train)
y_pred_svc = clf.predict(X_test)
print(classification_report(y_test, y_pred_svc, target_names=periods))

14505
                     precision    recall  f1-score   support

     early medieval       0.00      0.00      0.00       145
    late mesolithic       0.00      0.00      0.00         4
           medieval       0.38      0.01      0.02       962
      post medieval       0.60      0.93      0.73      2118
  later prehistoric       0.00      0.00      0.00       122
     early iron age       0.00      0.00      0.00        18
middle palaeolithic       0.00      0.00      0.00         1
          neolithic       0.00      0.00      0.00        58
      late iron age       0.00      0.00      0.00        59
         bronze age       0.00      0.00      0.00       105
   early bronze age       0.00      0.00      0.00        34
   late prehistoric       0.00      0.00      0.00        53
              roman       0.00      0.00      0.00       527
    middle iron age       0.00      0.00      0.00        36
     late neolithic       0.00      0.00      0.00        29
    early neolith

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
from sklearn.ensemble import RandomForestClassifier
clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=10)).fit(X_train, y_train)

y_pred_rf = clf.predict(X_test)
print(classification_report(y_test, y_pred_rf, target_names=periods))


                     precision    recall  f1-score   support

     early medieval       0.12      0.01      0.01       145
    late mesolithic       0.00      0.00      0.00         4
           medieval       0.37      0.04      0.08       962
      post medieval       0.60      0.90      0.72      2118
  later prehistoric       0.00      0.00      0.00       122
     early iron age       0.00      0.00      0.00        18
middle palaeolithic       0.00      0.00      0.00         1
          neolithic       0.00      0.00      0.00        58
      late iron age       0.00      0.00      0.00        59
         bronze age       0.00      0.00      0.00       105
   early bronze age       0.00      0.00      0.00        34
   late prehistoric       1.00      0.02      0.04        53
              roman       0.28      0.01      0.03       527
    middle iron age       0.00      0.00      0.00        36
     late neolithic       0.00      0.00      0.00        29
    early neolithic    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
from sklearn.tree import DecisionTreeClassifier
clf = MultiOutputClassifier(DecisionTreeClassifier()).fit(X_train, y_train)
y_pred_dt = clf.predict(X_test)
print(classification_report(y_test, y_pred_dt, target_names=periods))

                     precision    recall  f1-score   support

     early medieval       0.06      0.01      0.01       145
    late mesolithic       0.00      0.00      0.00         4
           medieval       0.28      0.05      0.09       962
      post medieval       0.60      0.88      0.72      2118
  later prehistoric       0.00      0.00      0.00       122
     early iron age       0.00      0.00      0.00        18
middle palaeolithic       0.00      0.00      0.00         1
          neolithic       0.00      0.00      0.00        58
      late iron age       0.00      0.00      0.00        59
         bronze age       0.00      0.00      0.00       105
   early bronze age       0.00      0.00      0.00        34
   late prehistoric       0.00      0.00      0.00        53
              roman       0.22      0.02      0.04       527
    middle iron age       0.00      0.00      0.00        36
     late neolithic       0.00      0.00      0.00        29
    early neolithic    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
