In [None]:
#library imports
import pandas as pd
import numpy as np
import glob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold,train_test_split
from numpy import mean
from numpy import std
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings("ignore")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Collect the paths of the text files found in each corpus folder.
# glob returns matches in arbitrary, filesystem-dependent order, so every list is
# sorted: this keeps the row order of the data (and therefore the seeded
# train/test split and positional lookups such as X[6]) reproducible.
txtfiles_MIS = sorted(glob.glob("MIS/*txt"))
txtfiles_PHIL = sorted(glob.glob("PHIL/*txt"))
txtfiles_UNLABELED = sorted(glob.glob("UNLABELED/*txt"))

In [None]:
# Load the full text of every collected file; each content list keeps the same
# order as the corresponding list of paths.
def _read_document(path):
  """Return the entire content of the text file at `path` (decoded with 'unicode_escape')."""
  with open("{}".format(path), "r", encoding='unicode_escape') as handle:
    return handle.read()

content_MIS = [_read_document(path) for path in txtfiles_MIS]
content_PHIL = [_read_document(path) for path in txtfiles_PHIL]
content_UNLABELED = [_read_document(path) for path in txtfiles_UNLABELED]

In [None]:
# Build one DataFrame per corpus. The labelled corpora carry a numeric class id
# in a "Label" column; the unlabelled corpus only has the "Sentence" column.
MIS, PHIL = 0, 1

def _corpus_frame(texts, label=None):
  """Wrap `texts` in a DataFrame named "Sentence", plus a constant "Label" column when `label` is given."""
  frame = pd.DataFrame(texts)
  column_names = ["Sentence"]
  if label is not None:
    frame["Label"] = label
    column_names.append("Label")
  frame.columns = column_names
  return frame

df_MIS = _corpus_frame(content_MIS, MIS)
df_PHIL = _corpus_frame(content_PHIL, PHIL)
df_UNLABELED = _corpus_frame(content_UNLABELED)

In [None]:
# Stack the two labelled corpora into a single frame with a fresh 0..n-1 index.
df = pd.concat([df_MIS, df_PHIL], ignore_index=True)

In [None]:
# Training texts and their class ids (0: MIS, 1: PHIL), plus the texts that
# still need a label.
X, y = df["Sentence"], df["Label"]
UNLABELED = df_UNLABELED["Sentence"]

In [None]:
# Load NLTK's English stop-word list; the words are removed from X in a later cell.
stop_words: list = stopwords.words('english')

In [None]:
# Load the project-specific stop words, one entry per line of StopwordsDict.txt.
# Only the line terminator is stripped (rather than slicing off the last
# character) so that a final line without a trailing newline keeps its last letter.
with open('StopwordsDict.txt') as file_in:
    StopwordsDict = [line.rstrip('\n') for line in file_in]

In [None]:
StopwordsDict

['(3+1+0)',
 '(3+2+0)',
 'an',
 'and',
 'to',
 'of',
 'is',
 'the',
 'am',
 'I',
 'or',
 'in',
 'a',
 'by',
 'from']

In [None]:
# Drop every whitespace-separated token that exactly (case-sensitively) matches
# an entry of StopwordsDict, then re-join the remaining tokens with single spaces.
def _without_custom_stopwords(sentence):
  kept_tokens = [token for token in sentence.split() if token not in (StopwordsDict)]
  return ' '.join(kept_tokens)

X = X.apply(_without_custom_stopwords)

In [None]:
# Remove NLTK English stop words from every sentence.
# The text is still mixed-case at this point (lowercasing happens afterwards) and
# NLTK's English list is lowercase, so each token is lowercased for the
# comparison; otherwise capitalised stop words such as "The" or "And" would survive.
_stop_word_set = set(stop_words)  # set membership is O(1) per token instead of a list scan
X = X.apply(lambda x: ' '.join(word for word in x.split() if word.lower() not in _stop_word_set))

In [None]:
# Lowercase every sentence (runs after both stop-word removal steps above).
X = X.str.lower()

In [None]:
# After the stop-word steps: strip punctuation, i.e. every character that is
# neither a word character nor whitespace.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave all punctuation in place.
X = X.str.replace(r'[^\w\s]', '', regex=True)

In [None]:
#lemmatization
# NOTE: the lemmatization code below is wrapped in a string literal, so it is
# never executed; the cell only evaluates (and displays) the string itself, and
# X_tokenized is therefore never created.

'''
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

X_tokenized = X.apply(lemmatize_text)
'''

'\nw_tokenizer = nltk.tokenize.WhitespaceTokenizer()\nlemmatizer = nltk.stem.WordNetLemmatizer()\n\ndef lemmatize_text(text):\n    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]\n\nX_tokenized = X.apply(lemmatize_text)\n'

In [None]:
# Raw text of the document at index 6, before any preprocessing.
df.loc[6, "Sentence"]

'Object Oriented Programming   (3+0+2) 4 ECTS 6\nFundamentals of object oriented programming; objects, data abstraction, data encapsulation, inheritance, and polymorphism. Exception handling. Graphical user interface programming. Database connection. Multithreading. Program development with object oriented design.'

In [None]:
# The document at index 6 after preprocessing: custom (StopwordsDict.txt) and
# English stop words removed, text lowercased, punctuation stripped.
X.loc[6]

'object oriented programming 302 4 ects 6 fundamentals object oriented programming objects data abstraction data encapsulation inheritance polymorphism exception handling graphical user interface programming database connection multithreading program development object oriented design'

In [None]:
  def train_evaluate_model(type_of_vectorization,gram_value_1,gram_value_2,algorithm_name,random_state=42):
    """Train one vectorizer/classifier combination and print its evaluation.

    The preprocessed texts and labels are read from the notebook-level ``X`` and
    ``y``. The data is split 67/33 into train and test parts, the vectorizer is
    fitted on the training part only, the classifier is trained, and the
    function prints the number of features, the classification report, the test
    accuracy and the mean accuracy of a 3-fold cross-validation on the training
    part.

    Parameters:
      type_of_vectorization: vectorizer class (e.g. CountVectorizer, TfidfVectorizer)
        that accepts an ``ngram_range`` keyword.
      gram_value_1: lower bound of the n-gram range.
      gram_value_2: upper bound of the n-gram range.
      algorithm_name: classifier class, instantiated with its default arguments.
      random_state: seed for the train/test split and, when the classifier
        exposes a ``random_state`` parameter, for the classifier itself, so that
        tree-based models give the same numbers on every run.

    Returns:
      None. All results are printed.
    """
    print("******************Model*******************************")
    print(type_of_vectorization,gram_value_1,gram_value_2,algorithm_name)
    #task1
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_state)

    # Learn the vocabulary from the training texts only, then reuse it for the test texts.
    vectorizer_1 = type_of_vectorization(ngram_range=(gram_value_1,gram_value_2))
    X_train = vectorizer_1.fit_transform(X_train)
    X_test = vectorizer_1.transform(X_test)
    print("******************Number of Features******************")
    print(f"Amount of feature: {len(vectorizer_1.get_feature_names_out())}")

    model_1 = algorithm_name()
    # Seed estimators that have a random component; others are left untouched.
    if "random_state" in model_1.get_params():
      model_1.set_params(random_state=random_state)
    model_1.fit(X_train, y_train)

    y_pred_test = model_1.predict(X_test)
    print("******************Classification Report***************")
    acc_test = accuracy_score(y_test, y_pred_test)
    cls_rep = classification_report(y_test, y_pred_test)
    print(cls_rep)
    print(f"Model performance on the TEST DATA is: {acc_test}")
    print("******************Evaluation of Model*****************")
    # NOTE(review): cross-validation runs on features vectorized with the whole
    # training part, so vocabulary/idf statistics are shared across folds -
    # consider wrapping vectorizer and model in a Pipeline if that matters.
    cv = KFold(n_splits=3, random_state=1, shuffle=True)
    scores = cross_val_score(model_1, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
    print('Accuracy:', mean(scores))


In [None]:
# Experiment grid: every classifier is combined with every vectorizer, once with
# unigrams (1,1) and once with bigrams (2,2).
# Each entry is [vectorizer class, n-gram min, n-gram max, classifier class].
List = [
         [vectorizer_cls, n, n, classifier_cls]
         for classifier_cls in (DecisionTreeClassifier, LogisticRegression, RandomForestClassifier)
         for vectorizer_cls in (TfidfVectorizer, CountVectorizer)
         for n in (1, 2)
         ]

In [None]:
# Run every configuration of the grid. train_evaluate_model prints its own
# report and returns None, so it is called directly: wrapping the call in
# print() would add a stray "None" line after each report.
for vectorizer_cls, min_n, max_n, classifier_cls in List:
  train_evaluate_model(vectorizer_cls, min_n, max_n, classifier_cls)

******************Model*******************************
<class 'sklearn.feature_extraction.text.TfidfVectorizer'> 1 1 <class 'sklearn.tree._classes.DecisionTreeClassifier'>
******************Number of Features******************
Amount of feature: 665
******************Classification Report***************
              precision    recall  f1-score   support

           0       0.88      0.78      0.82         9
           1       0.83      0.91      0.87        11

    accuracy                           0.85        20
   macro avg       0.85      0.84      0.85        20
weighted avg       0.85      0.85      0.85        20

Model performance on the TEST DATA is: 0.85
******************Evaluation of Model*****************
Accuracy: 0.847985347985348
None
******************Model*******************************
<class 'sklearn.feature_extraction.text.TfidfVectorizer'> 2 2 <class 'sklearn.tree._classes.DecisionTreeClassifier'>
******************Number of Features******************
Amount of

In [None]:
train_evaluate_model(CountVectorizer,1,1,LogisticRegression) #best model (count vectorizer, 1-gram, logistic regression)

******************Model*******************************
<class 'sklearn.feature_extraction.text.CountVectorizer'> 1 1 <class 'sklearn.linear_model._logistic.LogisticRegression'>
******************Number of Features******************
Amount of feature: 665
******************Classification Report***************
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        11

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Model performance on the TEST DATA is: 1.0
******************Evaluation of Model*****************
Accuracy: 0.9487179487179488


In [None]:
def find_labels(type_of_vectorization,gram_value_1,gram_value_2,algorithm_name,random_state=42):
  """Train on all labelled texts and print a predicted label for each unlabelled text.

  The vectorizer is fitted on the notebook-level ``X``, the classifier is
  trained on ``X``/``y``, and every entry of ``UNLABELED`` is printed together
  with its predicted class (0: MIS, 1: PHIL).

  Parameters:
    type_of_vectorization: vectorizer class that accepts an ``ngram_range`` keyword.
    gram_value_1: lower bound of the n-gram range.
    gram_value_2: upper bound of the n-gram range.
    algorithm_name: classifier class, instantiated with its default arguments.
    random_state: seed applied to the classifier when it exposes a
      ``random_state`` parameter, so repeated runs give the same labels.

  Returns:
    None. The predictions are printed.
  """
  vectorizer_2 = type_of_vectorization(ngram_range=(gram_value_1,gram_value_2))
  X_train_2 = vectorizer_2.fit_transform(X)
  # NOTE(review): UNLABELED appears to be vectorized as raw text, while X went
  # through stop-word and punctuation removal - confirm the mismatch is acceptable.
  X_test_2 = vectorizer_2.transform(UNLABELED)

  model_2 = algorithm_name()
  # Seed estimators that have a random component; others are left untouched.
  if "random_state" in model_2.get_params():
    model_2.set_params(random_state=random_state)
  model_2.fit(X_train_2, y)

  pred_2 = model_2.predict(X_test_2)

  # Pair texts and predictions positionally so the output does not depend on
  # the index labels of UNLABELED.
  for text, label in zip(UNLABELED, pred_2):
    print("Label is {} (0:MIS 1:PHIL) for \n".format(label),text)
    print("************")

In [None]:
find_labels(CountVectorizer,1,1,LogisticRegression)

Label is 1 (0:MIS 1:PHIL) for 
 Advanced Study of Philosophical Texts (3+0+0) 3 ECTS 6
In-depth study of selected texts from a thematic or historical perspective in their original language.
Philosophy of Mathematics (3+0+0) 3 ECTS 6
Foundations of mathematics and philosophical problems in the interpretation of the nature of mathematics.
************
Label is 1 (0:MIS 1:PHIL) for 
 Ethical Issues in Computing (3+0+0) 3 ECTS 5
Introduction to ethical issues related to computer-based information systems: accessibility and censorship; security and privacy; intellectual property: copyrights and plagiarism; the digital divide. Professional and ethical responsibilities of Information Technology (IT) users; sensitivity to end-user needs.
Human Factors in Computing  (3+0+0) 3 ECTS 6
Understanding the effects of the human factors in developing and operating the information systems. Practical issues encountered in man-machine interaction and user-interface design. Current trends in the developmen