#Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import requests
import gzip
from io import BytesIO, StringIO
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support

#loading data


In [2]:
response = requests.get("https://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz")
compressed_file = BytesIO(response.content)

decompressed_file = gzip.GzipFile(fileobj=compressed_file)

content = decompressed_file.read().decode('utf-8')

data = [line.split() for line in content.splitlines() if line]
df = pd.DataFrame(data, columns=["Word", "POS", "Chunk"])

print(df.head())

         Word  POS Chunk
0  Confidence   NN  B-NP
1          in   IN  B-PP
2         the   DT  B-NP
3       pound   NN  I-NP
4          is  VBZ  B-VP


#cleaning the data

In [3]:
df.isnull().sum()

Word     0
POS      0
Chunk    0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,Word,POS,Chunk
count,211727,211727,211727
unique,19122,44,22
top,",",NN,I-NP
freq,10770,30147,63307


In [5]:
df = df.drop('Chunk', axis=1)

In [6]:
df.describe()

Unnamed: 0,Word,POS
count,211727,211727
unique,19122,44
top,",",NN
freq,10770,30147


#Training

In [7]:
 # dict vectorizer
from sklearn.feature_extraction import DictVectorizer

def dictvec(row):
    data = {
        'W': row['Word'],
        'len': len(row['Word']),
        'cap': row['Word'][0].isupper(),
    }
    return data

df_dict = df.drop('POS', axis=1).apply(dictvec, axis=1)

df_dict.head()

0    {'W': 'Confidence', 'len': 10, 'cap': True}
1            {'W': 'in', 'len': 2, 'cap': False}
2           {'W': 'the', 'len': 3, 'cap': False}
3         {'W': 'pound', 'len': 5, 'cap': False}
4            {'W': 'is', 'len': 2, 'cap': False}
dtype: object

In [8]:
vectorizer = DictVectorizer(sparse=True)
X = vectorizer.fit_transform(df_dict)

le = LabelEncoder()
y = le.fit_transform(df['POS'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
'''
df_encoded = df.copy() #onehot encoding, deep vectorizer

label_encoders = {}

for column in df_encoded.columns:
    le = LabelEncoder()
    df_encoded[column] = le.fit_transform(df_encoded[column])
    label_encoders[column] = le

df_encoded.head()

In [None]:
'''
X = df_encoded.drop('POS', axis=1)
y = df_encoded['POS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
#baysian multinomial
param_grid = {'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0]}
grid_model = GridSearchCV(MultinomialNB(), param_grid, cv=5)
grid_model.fit(X_train, y_train)

In [11]:
grid_model.best_params_

{'alpha': 2.0}

In [12]:
best = grid_model.best_estimator_

In [13]:
y_pred = best.predict(X_test)
print('Classification Report:\n', classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00       339
           2       1.00      0.97      0.99       267
           3       1.00      1.00      1.00        52
           4       1.00      1.00      1.00        48
           5       0.97      1.00      0.99      2154
           6       1.00      1.00      1.00      1719
           7       1.00      1.00      1.00       226
           8       1.00      1.00      1.00      1036
           9       1.00      0.84      0.91      1659
          10       0.63      0.99      0.77      3767
          11       0.91      1.00      0.95        48
          12       1.00      0.83      0.91         6
          13       0.99      0.94      0.96      4600
          14       0.94      0.83      0.88      2646
          15       0.86      0.92      0.89       169
          16       0.97      0.96      0.97        81
   

In [14]:
#log reg
logreg = LogisticRegression(max_iter=10000,random_state=42)
logreg.fit(X_train, y_train)

In [15]:
y_pred_logreg = logreg.predict(X_test)

In [16]:
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred_logreg, average='weighted')

# the performance metrics
print("Weighted Average Precision:", precision)
print("Weighted Average Recall:", recall)
print("Weighted Average F1-Score:", f1_score)

Weighted Average Precision: 0.9331917528950955
Weighted Average Recall: 0.9299579653332074
Weighted Average F1-Score: 0.9288362632164389


  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
#svm
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

In [18]:
y_pred_svm = svm_model.predict(X_test)

In [19]:
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred_svm, average='weighted')

# the performance metrics
print("Weighted Average Precision:", precision)
print("Weighted Average Recall:", recall)
print("Weighted Average F1-Score:", f1_score)

print(classification_report(y_test,y_pred_svm))

Weighted Average Precision: 0.9248268711280252
Weighted Average Recall: 0.9103339158362065
Weighted Average F1-Score: 0.9120341313202538
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00       339
           2       1.00      0.97      0.99       267
           3       1.00      1.00      1.00        52
           4       1.00      1.00      1.00        48
           5       1.00      1.00      1.00      2154
           6       1.00      1.00      1.00      1719
           7       1.00      1.00      1.00       226
           8       1.00      1.00      1.00      1036
           9       1.00      0.84      0.91      1659
          10       0.97      0.99      0.98      3767
          11       0.91      1.00      0.95        48
          12       1.00      0.83      0.91         6
          13       0.66      1.00      0.80      4600
          14       0.94      0.84      0.89      264

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#svm = SVC(kernel='linear',probability=True) #linear

best_model = VotingClassifier(estimators=[
    ('bayes', grid_model),
    ('logistic', logreg),
    ('svm_model', svm_model)],
    voting='hard') #hard


In [21]:
best_model.fit(X_train, y_train)

predictions = best_model.predict(X_test)

print("Weighted Average Precision:", precision)
print("Weighted Average Recall:", recall)
print("Weighted Average F1-Score:", f1_score)

print(classification_report(y_test,predictions))

Weighted Average Precision: 0.9248268711280252
Weighted Average Recall: 0.9103339158362065
Weighted Average F1-Score: 0.9120341313202538
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00       339
           2       1.00      0.97      0.99       267
           3       1.00      1.00      1.00        52
           4       1.00      1.00      1.00        48
           5       0.98      1.00      0.99      2154
           6       1.00      1.00      1.00      1719
           7       1.00      1.00      1.00       226
           8       1.00      1.00      1.00      1036
           9       1.00      0.84      0.91      1659
          10       0.63      0.99      0.77      3767
          11       0.91      1.00      0.95        48
          12       1.00      0.83      0.91         6
          13       0.95      1.00      0.97      4600
          14       0.94      0.85      0.89      264

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
with open("unlabeled_test_test.txt", "r") as f:
    words = [word for word in f.read().splitlines() if word]

unlabeled_df = pd.DataFrame(words, columns=["Word"])
unlabeled_dict = unlabeled_df.apply(dictvec, axis=1)

X_unlabeled = vectorizer.transform(unlabeled_dict)
X_unlabeled = scaler.transform(X_unlabeled)

predicted_labels = best_model.predict(X_unlabeled)

predicted_POS = le.inverse_transform(predicted_labels)

with open('yousefalrabiah_andreimudrov.test.txt', 'w') as f:
    for word, pos in zip(words, predicted_POS):
        f.write(f"{word} {pos}\n")
