In [1]:
# --------------------------------------
import pandas as pd
import numpy as np
import string
# --------------------------------------


# --------------------------------------
# ------------- visualizations:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# --------------------------------------


# ---------------------------------------
import sklearn
from sklearn import preprocessing, metrics, pipeline, model_selection, feature_extraction 
from sklearn import naive_bayes, linear_model, svm, neural_network, neighbors, tree
from sklearn import decomposition, cluster

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# ---------------------------------------


# ----------------- output and visualizations: 
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
# Display the result of every bare expression in a cell (not only the last one), so several outputs can be condensed into one cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
pd.pandas.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# ---------------------------------------

### Text analysis and String manipulation imports:

In [2]:
# --------------------------------------
# --------- Text analysis and Hebrew text analysis imports:
# vectorizers:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# regular expressions:
import re
# --------------------------------------

# more packages:
from collections import Counter

In [3]:
!pip install wn
!python -m wn download omw-he:1.4




[KCached file found: C:\Users\Home\.wn_data\downloads\7ecf10e89326bc0ac26ad94b40fe60a7b6ac3302

[KChecking C:\Users\Home\AppData\Local\Temp\tmprmvhf5i8\omw-he\omw-he.xml
[KSkipping omw-he:1.4 (Hebrew Wordnet); already added



In [4]:
# word net
import wn

In [5]:
!pip install hebrew_tokenizer



In [6]:
# tokenizer:
import hebrew_tokenizer as ht

C:\Users\Home\Downloads\ML_Course\Assignments\Task4


### Reading input files
Load the annotated training corpus (raw story text with a gender label) and the unlabelled test corpus.

In [7]:
# File names of the labelled training corpus and the unlabelled test corpus.
train_filename, test_filename = 'annotated_corpus_for_train.csv', 'corpus_for_test.csv'

# Read both CSV files with the same options (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path, index_col=None, encoding='utf-8')
    for csv_path in (train_filename, test_filename)
)

In [8]:
# Quick look at both corpora. Each bare expression below is rendered because
# InteractiveShell.ast_node_interactivity was set to "all" in the setup cell.
df_train.head(8)
df_train.shape
df_test.head()

Unnamed: 0,story,gender
0,"כשחבר הזמין אותי לחול, לא באמת חשבתי שזה יקרה,...",m
1,לפני שהתגייסתי לצבא עשיתי כל מני מיונים ליחידו...,m
2,מאז שהתחילו הלימודים חלומו של כל סטודנט זה הפנ...,f
3,"כשהייתי ילד, מטוסים היה הדבר שהכי ריתק אותי. ב...",m
4,‏הייתי מדריכה בכפר נוער ומתאם הכפר היינו צריכי...,f
5,לפני כ3 חודשים טסתי לרומא למשך שבוע. טסתי במטו...,f
6,אני כבר שנתיים נשוי והשנה אני ואישתי סוף סוף י...,m
7,השנה התחלנו שיפוץ בדירה שלנו בתל אביב. הדירה ה...,f


(753, 2)

Unnamed: 0,test_example_id,story
0,0,כל קיץ אני והמשפחה נוסעים לארצות הברית לוס אנג...
1,1,"הגעתי לשירות המדינה אחרי שנתיים כפעיל בתנועת ""..."
2,2,אחת האהבות הגדולות שלי אלו הכלבים שלי ושל אישת...
3,3,"רגע הגיוס לצבא היה הרגע הכי משמעותי עבורי, אני..."
4,4,אני הגעתי לברזיל ישר מקולומביה וגם אני עשיתי ע...


In [9]:
# Show the first three rows of the test corpus.
df_test.head(3)

Unnamed: 0,test_example_id,story
0,0,כל קיץ אני והמשפחה נוסעים לארצות הברית לוס אנג...
1,1,"הגעתי לשירות המדינה אחרי שנתיים כפעיל בתנועת ""..."
2,2,אחת האהבות הגדולות שלי אלו הכלבים שלי ושל אישת...


Written by: Almog Yosef<br>

## Helper Functions 

In [10]:
## Strip ASCII digits from a story ##
def remove_digits(story):
    """Return `story` with every character of `string.digits` ('0'-'9') deleted.

    Only ASCII digits are removed; digits from other scripts are left untouched.
    """
    # A translate table that maps a code point to None deletes that character.
    drop_digits = {ord(digit): None for digit in string.digits}
    return story.translate(drop_digits)

## Strip Latin-alphabet words from a story ##
def remove_english_words(story):
    """Return `story` with every run of ASCII letters (a-z, A-Z) deleted.

    Surrounding whitespace is kept, so removing a word may leave consecutive spaces.
    """
    latin_run = re.compile('[a-zA-Z]+')
    return latin_run.sub('', story)

## Strip ASCII punctuation from a story ##
def remove_punctuation(story):
    """Return `story` with every character of `string.punctuation` deleted.

    Characters are deleted, not replaced by a space, so "a,b" becomes "ab".
    `string.punctuation` is ASCII-only; other punctuation marks are kept.
    """
    # Code points mapped to None are removed by str.translate.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    return story.translate(drop_punctuation)


## Collect the short Hebrew tokens found in the stories ##
def search_heb_words(dataframe):
    """Return the short Hebrew tokens that occur in an iterable of stories.

    All stories are joined with single spaces and tokenized with
    `hebrew_tokenizer`. A token is kept when the first element of its tuple
    equals 'HEBREW' and its second element (the token text, as used here) is
    shorter than 3 characters. Repeated tokens are not de-duplicated.

    Returns:
        list: the second element of every kept token tuple, in tokenizer order.
    """
    joined_stories = ' '.join(dataframe)
    return [
        token[1]
        for token in ht.tokenize(joined_stories)
        if token[0] == 'HEBREW' and len(token[1]) < 3
    ]

## Remove every whitespace-separated word of a story that appears in heb_tokens ##
def remove_words(story, heb_tokens):
    """Drop the words of `story` that are contained in `heb_tokens`.

    Args:
        story: text to filter; it is split on any whitespace.
        heb_tokens: collection of words to remove. A list or tuple is converted
            to a set once per call, so each word costs a hash lookup instead of
            a linear scan over a possibly long list with duplicates. Any other
            container is used unchanged, keeping its own `in` semantics.

    Returns:
        str: the remaining words joined by single spaces ("" if none remain).
    """
    if isinstance(heb_tokens, (list, tuple)):
        blocked = set(heb_tokens)
    else:
        blocked = heb_tokens
    kept_words = [word for word in story.split() if word not in blocked]
    return " ".join(kept_words)


### Processing the stories: removing English letters, punctuation and digits

In [11]:
def _clean_story(story):
    """Strip ASCII punctuation, then Latin letters, then ASCII digits from one story."""
    return remove_digits(remove_english_words(remove_punctuation(story)))


# Clean the training stories, then collect the short Hebrew tokens to drop from them.
df_train["story"] = df_train["story"].map(_clean_story)
heb_tokens = search_heb_words(df_train["story"])
df_train["story"] = df_train["story"].map(lambda story: remove_words(story, heb_tokens))

# Apply exactly the same cleaning to the test stories. Previously only df_train was
# cleaned, so the final model was trained on cleaned text but predicted on raw text,
# which yields different word n-grams (punctuation-joined words, short words kept).
# The token list comes from the training corpus only.
df_test["story"] = (
    df_test["story"]
    .map(_clean_story)
    .map(lambda story: remove_words(story, heb_tokens))
)

## Several Classifiers to test

In [12]:
## Shared feature pipeline used by all three classifiers
def _build_text_pipeline(classifier, vectorizer_step='vec'):
    """Wrap `classifier` in the text-feature pipeline shared by every model here.

    Steps:
      1. TfidfVectorizer: tokens are runs of characters in the range א-ת,
         n-grams of 1 to 7 tokens, max_df=0.8, sub-linear tf and no idf weighting.
      2. Normalizer: L2-normalises each sample.
      3. 'clf': the classifier passed in.

    Args:
        classifier: unfitted scikit-learn estimator used as the final 'clf' step.
        vectorizer_step: name of the vectorizer step. It is kept configurable
            because the original pipelines used 'vec' and 'vect'.

    Returns:
        Pipeline: the unfitted pipeline.
    """
    return Pipeline([
        (vectorizer_step, TfidfVectorizer(token_pattern=r'[א-ת]+', max_df=0.8, ngram_range=(1, 7),
                                          use_idf=False, sublinear_tf=True)),
        ('norm', preprocessing.Normalizer(norm='l2')),
        ('clf', classifier),
    ])


## Perceptron classifier
def Perceptron_train(x_train, y_train):
    """Fit the shared text pipeline with a Perceptron and return the fitted pipeline."""
    # NOTE(review): no `penalty` is passed; per the scikit-learn docs `alpha` only
    # scales the regularisation term, so it likely has no effect here - confirm.
    classifier = Perceptron(tol=1e-3, random_state=42, alpha=0.8, max_iter=10)
    return _build_text_pipeline(classifier).fit(x_train, y_train)


## SVC classifier
def linearSVC_train(x_train, y_train):
    """Fit the shared text pipeline with a class-balanced LinearSVC and return it."""
    classifier = LinearSVC(class_weight='balanced')
    # This pipeline historically names its vectorizer step 'vect'; keep that name.
    return _build_text_pipeline(classifier, vectorizer_step='vect').fit(x_train, y_train)


## Naive Bayes classifier
def MultinomialNB_train(x_train, y_train):
    """Fit the shared text pipeline with MultinomialNB (alpha=0.8, fit_prior=False) and return it."""
    classifier = MultinomialNB(alpha=0.8, fit_prior=False)
    return _build_text_pipeline(classifier).fit(x_train, y_train)


def metrics_classification_report(clf, x_test, y_test):
    """Evaluate a fitted classifier on held-out data.

    Args:
        clf: fitted estimator exposing `predict`.
        x_test: samples to predict.
        y_test: true labels for `x_test`.

    Returns:
        tuple: (classification report text, mean accuracy as a fraction,
        confusion matrix as a DataFrame with true labels on the rows and
        predicted labels on the columns).
    """
    predict = clf.predict(x_test)
    # Derive the label set from the data instead of hard-coding ["f", "m"], and pass
    # it to confusion_matrix so the matrix shape always matches the axis labels
    # (also when one class is missing from this split).
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(predict)]))
    matrix = metrics.confusion_matrix(y_true=y_test, y_pred=predict, labels=labels)
    df = pd.DataFrame(matrix, columns=labels, index=labels)
    return metrics.classification_report(y_test, predict), np.mean(predict == y_test), df

def confusion_matrix_plot(y_true=None, y_pred=None, labels=None):
    """Print a labelled confusion matrix (rows: true labels, columns: predictions).

    The previous body referenced names that are not defined in this notebook
    (`test_data`, `train_data`, `df_train.target_names`) and could not run.
    The inputs are now explicit, optional-by-signature parameters.

    Args:
        y_true: true labels.
        y_pred: predicted labels, aligned with `y_true`.
        labels: label order for both axes; defaults to the sorted set of labels
            found in `y_true` and `y_pred`.

    Raises:
        ValueError: if `y_true` or `y_pred` is not supplied.
    """
    if y_true is None or y_pred is None:
        raise ValueError("confusion_matrix_plot requires both y_true and y_pred")
    if labels is None:
        labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    matrix = metrics.confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)
    print(pd.DataFrame(matrix, columns=labels, index=labels))


In [13]:
## Split the data into X_train, X_test, y_train, y_test
def split_data(X, y):
    """Split features and labels into an 80% / 20% train / test partition.

    `shuffle=False` makes the split positional: the first 80% of the rows are
    used for training and the last 20% for testing.
    NOTE(review): per the scikit-learn docs `random_state` is not used when
    shuffling is off - confirm it can be dropped.

    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    parts = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)
    return tuple(parts)

# Hold-out split of the cleaned training stories and their gender labels.
X_train, X_test, y_train, y_test = split_data(df_train["story"],df_train["gender"])

## SVC Prediction

In [14]:
# Fit the LinearSVC pipeline on the training split and evaluate it on the hold-out split.
clf = linearSVC_train(X_train,y_train)
# metric: classification report text; report_mean: fraction of correct predictions;
# df: confusion matrix as a DataFrame.
metric,report_mean ,df= metrics_classification_report(clf,X_test,y_test)
print(metric)

# report_mean is a fraction, so scale it to a percentage for display.
print('The Avrage Accuracy of SVC is:\n{} %'.format(report_mean * 100))
print('\nThe Confusion Matrix:\n{}'.format(df))

              precision    recall  f1-score   support

           f       0.67      0.14      0.23        29
           m       0.83      0.98      0.90       122

    accuracy                           0.82       151
   macro avg       0.75      0.56      0.56       151
weighted avg       0.80      0.82      0.77       151

The Avrage Accuracy of SVC is:
82.11920529801324 %

The Confusion Matrix:
   f    m
f  4   25
m  2  120


## Naive Bayes Prediction

In [15]:
# Fit the MultinomialNB pipeline on the training split and evaluate it on the hold-out split.
clf = MultinomialNB_train(X_train,y_train)
# metric: classification report text; report_mean: fraction of correct predictions;
# df: confusion matrix as a DataFrame.
metric,report_mean ,df= metrics_classification_report(clf,X_test,y_test)
print(metric)

# report_mean is a fraction, so scale it to a percentage for display.
print('The Avrage Accuracy of Naive Base is:\n{} %'.format(report_mean * 100))
print('\nThe Confusion Matrix:\n{}'.format(df))

              precision    recall  f1-score   support

           f       0.00      0.00      0.00        29
           m       0.81      1.00      0.89       122

    accuracy                           0.81       151
   macro avg       0.40      0.50      0.45       151
weighted avg       0.65      0.81      0.72       151

The Avrage Accuracy of Naive Base is:
80.79470198675497 %

The Confusion Matrix:
   f    m
f  0   29
m  0  122


## Perceptron Prediction

In [16]:
# Fit the Perceptron pipeline on the training split and evaluate it on the hold-out split.
clf = Perceptron_train(X_train,y_train)
# metric: classification report text; report_mean: fraction of correct predictions;
# df: confusion matrix as a DataFrame.
metric,report_mean ,df= metrics_classification_report(clf,X_test,y_test)
print(metric)

# report_mean is a fraction, so scale it to a percentage for display.
print('The Avrage Accuracy of Perceptron is:\n{} %'.format(report_mean * 100))
print('\nThe Confusion Matrix:\n{}'.format(df))

              precision    recall  f1-score   support

           f       0.64      0.62      0.63        29
           m       0.91      0.92      0.91       122

    accuracy                           0.86       151
   macro avg       0.78      0.77      0.77       151
weighted avg       0.86      0.86      0.86       151

The Avrage Accuracy of Perceptron is:
86.09271523178808 %

The Confusion Matrix:
    f    m
f  18   11
m  10  112


## Final Results
The Perceptron reached the highest hold-out accuracy of the three models (86.1%, versus 82.1% for LinearSVC and 80.8% for Naive Bayes).\
We'll predict categories of the second file with this model.

## Predicting corpus_for_test.csv

In [17]:
# Refit the selected model (Perceptron) on every labelled story, then predict the
# gender of each story in the unlabelled test corpus.
X = pd.concat([df_train["story"], df_test["story"]],ignore_index=True)
# Boundary between labelled and unlabelled rows. Taken from the data instead of the
# hard-coded 753, so the cell stays correct if the training file changes size.
n_train = len(df_train)
X_train = X[:n_train]
X_test = X[n_train:]
y_train = df_train["gender"]
clf = Perceptron_train(X_train,y_train)
y_pred = clf.predict(X_test)
# One prediction per test story, in the row order of df_test.
assert len(y_pred) == len(df_test), "expected exactly one prediction per test story"
df_predicted = pd.DataFrame({"test_example_id" : df_test["test_example_id"] ,"predicted_category" : y_pred })

### Save output to csv
After you're done save your output to the 'classification_results.csv' csv file.<br/>
We assume that the dataframe with your results contain the following columns:
* column 1 (left column): 'test_example_id'  - the same id associated to each of the test stories to be predicted.
* column 2 (right column): 'predicted_category' - the predicted gender value for each of the associated story. 

Assuming your predicted values are in the `df_predicted` dataframe, you should save your results as follows:

### Printing the predicted categories:

In [18]:
# Preview the first ten predictions, then write all of them to the results file
# (index=False keeps the DataFrame index out of the CSV).
df_predicted.head(10)
df_predicted.to_csv('classification_results.csv',index=False)

Unnamed: 0,test_example_id,predicted_category
0,0,m
1,1,m
2,2,m
3,3,m
4,4,f
5,5,m
6,6,m
7,7,f
8,8,m
9,9,m
