# User Audience Prediction


### Install and Import

In [None]:
# Install required packages

!pip install -q transformers
!pip install -q hazm
!pip install -q clean-text[gpl]
!pip install numpy requests nlpaug



In [None]:
# Import required packages

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils.extmath import safe_sparse_dot

import hazm
from cleantext import clean

import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

import nlpaug.augmenter.word as naw

import os
import re
import json
import copy
import collections

# Train

## Dataset


### Load the data using Pandas

In [None]:
data = pd.read_csv("/content/train.csv", names=['id', 'comment', 'rate'], skiprows=1)

In [None]:
data.head()

Unnamed: 0,id,comment,rate
0,0,شرایط حذف ترم چیه؟,1
1,1,از کجا می تونم با دکتر وحیدی ارتباط برقرار کنم؟,2
2,2,بوفه برداران تا ساعت چند باز است؟,2
3,3,کمترین تعداد واحد چند عدد است؟,1
4,4,سنگ جامد است,5


In [None]:
# Keep only the text and its rate. The explicit copy makes `data` an
# independent frame, so later column assignments do not write into a slice of
# the loaded frame (which triggers pandas' SettingWithCopyWarning).
data = data[['comment', 'rate']].copy()

In [None]:
data.head()

Unnamed: 0,comment,rate
0,شرایط حذف ترم چیه؟,1
1,از کجا می تونم با دکتر وحیدی ارتباط برقرار کنم؟,2
2,بوفه برداران تا ساعت چند باز است؟,2
3,کمترین تعداد واحد چند عدد است؟,1
4,سنگ جامد است,5


### Fixing Conflicts


In [None]:
# print data information
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
print('data information')
data.info()
print()

# print missing values information
print('missing values stats')
print(data.isnull().sum(), '\n')

# print some missing values
print('some missing values')
print(data[data['rate'].isnull()].iloc[:5], '\n')

data information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048 entries, 0 to 3047
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  3048 non-null   object
 1   rate     3048 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.8+ KB
None 

missing values stats
comment    0
rate       0
dtype: int64 

some missing values
Empty DataFrame
Columns: [comment, rate]
Index: [] 



In [None]:
# handle some conflicts with the dataset structure
# you can find a reliable solution, for the sake of the simplicity
# I just remove these bad combinations!

# Keep only rates strictly between 0 and 6 (1..5 for integer rates). A boolean
# mask removes the bad rows directly: nothing is assigned into the frame and
# no None placeholders are created. Missing rates compare as False and are
# therefore dropped as well.
valid_rate = (data['rate'] > 0) & (data['rate'] < 6)

data = (
    data[valid_rate]
    .dropna(subset=['comment'])
    .drop_duplicates(subset=['comment'], keep='first')
    .reset_index(drop=True)
)


# previous information after solving the conflicts

# print data information
# (DataFrame.info() prints by itself and returns None, so it is not wrapped in print())
print('data information')
data.info()
print()

# print missing values information
print('missing values stats')
print(data.isnull().sum(), '\n')

# print some missing values
print('some missing values')
print(data[data['rate'].isnull()].iloc[:5], '\n')

data information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2912 entries, 0 to 2911
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  2912 non-null   object
 1   rate     2912 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 45.6+ KB
None 

missing values stats
comment    0
rate       0
dtype: int64 

some missing values
Empty DataFrame
Columns: [comment, rate]
Index: [] 





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Normalization / Preprocessing


In [None]:
# calculate the length of comments based on their words
data['comment_len_by_words'] = data['comment'].apply(lambda t: len(hazm.word_tokenize(t)))

In [None]:
min_max_len = data["comment_len_by_words"].min(), data["comment_len_by_words"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')

Min: 1 	Max: 29


In [None]:
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='comment_len_by_words'):
    """Print the share of rows whose length lies in ``(greater_than, less_than]``.

    Args:
        data: DataFrame that holds the length column.
        less_than: Upper bound of the length (inclusive).
        greater_than: Lower bound of the length (exclusive).
        col: Name of the column that stores the lengths.

    Returns:
        None. The percentage is printed with two decimals.
    """
    data_length = data[col].values
    total = len(data_length)

    if total:
        # Vectorised count of the lengths that fall inside the interval.
        in_range = (data_length > greater_than) & (data_length <= less_than)
        data_glt_rate = (int(in_range.sum()) / total) * 100
    else:
        # An empty frame has nothing to report; avoid dividing by zero.
        data_glt_rate = 0.0

    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')

In [None]:
data_gl_than(data, 25, 3)

Texts with word length of greater than 3 and less than 25 includes 95.78% of the whole!


In [None]:
minlim, maxlim = 3, 25

In [None]:
# keep only comments whose word count lies in (minlim, maxlim], i.e. drop
# comments with at most `minlim` words or with more than `maxlim` words.
# A boolean mask keeps the dtype of the length column; writing None for the
# rejected rows first turns the column into floats when any row is rejected.
length_in_range = (data['comment_len_by_words'] > minlim) & (data['comment_len_by_words'] <= maxlim)
data = data[length_in_range].reset_index(drop=True)

In [None]:
# histogram of the number of words per comment
fig = go.Figure(data=[go.Histogram(x=data['comment_len_by_words'])])

hist_layout = dict(
    title_text='Distribution of word counts within comments',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)
fig.update_layout(**hist_layout)

fig.show()

In [None]:
unique_rates = list(sorted(data['rate'].unique()))
print(f'We have #{len(unique_rates)}: {unique_rates}')

We have #5: [1, 2, 3, 4, 5]


In [None]:
# number of comments per rate, drawn as a bar chart
groupby_rate = data.groupby('rate')['rate'].count()
rate_counts = groupby_rate.tolist()

fig = go.Figure(data=[
    go.Bar(
        x=sorted(groupby_rate.index),
        y=rate_counts,
        text=rate_counts,
        textposition='auto',
    )
])

bar_layout = dict(
    title_text='Distribution of labels within comments',
    xaxis_title_text='label',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)
fig.update_layout(**bar_layout)

fig.show()

In [None]:
def rate_to_label(rate):
    """Translate a numeric rate into its category name.

    Rates equal to 1, 2, 3 and 4 map to 'amoozesh', 'information',
    'site/lib' and 'opinions'; any other value maps to 'others'.
    """
    named_rates = (
        (1, 'amoozesh'),
        (2, 'information'),
        (3, 'site/lib'),
        (4, 'opinions'),
    )
    for known_rate, name in named_rates:
        if rate == known_rate:
            return name
    return 'others'


# attach the category name of every rate and collect the sorted category list
data['label'] = data['rate'].apply(rate_to_label)
labels = sorted(data['label'].unique())
data.head()

Unnamed: 0,comment,rate,comment_len_by_words,label
0,شرایط حذف ترم چیه؟,1,5.0,amoozesh
1,از کجا می تونم با دکتر وحیدی ارتباط برقرار کنم؟,2,11.0,information
2,بوفه برداران تا ساعت چند باز است؟,2,8.0,information
3,کمترین تعداد واحد چند عدد است؟,1,7.0,amoozesh
4,سرورای دانشکده مشکل دارن؟,3,5.0,site/lib


In [None]:
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed (shortest match)."""
    return re.sub('<.*?>', '', raw_html)


def cleaning(text):
    """Clean and normalise a single raw comment.

    Steps, in order: strip surrounding whitespace; run ``clean`` from
    cleantext (lower-casing, unicode fixing, line breaks / URLs / e-mails /
    phone numbers / currency symbols handled as configured below); strip
    ``<...>`` tags with ``cleanhtml``; apply ``hazm.Normalizer``; remove the
    emoji, pictograph and directional-mark code points listed in the pattern;
    drop ``#`` characters and collapse whitespace runs into one space.

    Args:
        text: Raw comment string.

    Returns:
        The cleaned comment string.
    """
    text = text.strip()

    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns
    # NOTE(review): the range U+24C2..U+1F251 below is very wide; besides
    # symbols it also spans e.g. CJK and Arabic presentation-form code points.
    weird_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        # u"\u200c"  (zero-width non-joiner is not part of the pattern)
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)

    text = weird_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    # raw string: "\s" in a normal string literal is an invalid escape sequence
    text = re.sub(r"\s+", " ", text)
    # text = re.sub(u"\u200c", " ", text)

    return text

In [None]:
# cleaning comments
data['cleaned_comment'] = data['comment'].apply(cleaning)


# calculate the length of comments based on their words
data['cleaned_comment_len_by_words'] = data['cleaned_comment'].apply(lambda t: len(hazm.word_tokenize(t)))

# keep only cleaned comments whose word count lies in (minlim, maxlim].
# The previous lambda returned the length in both of its branches, so no value
# was ever replaced by None and dropna() never removed a row.
cleaned_len_in_range = (
    (data['cleaned_comment_len_by_words'] > minlim)
    & (data['cleaned_comment_len_by_words'] <= maxlim)
)
data = data[cleaned_len_in_range].reset_index(drop=True)

data.head()

Unnamed: 0,comment,rate,comment_len_by_words,label,cleaned_comment,cleaned_comment_len_by_words
0,شرایط حذف ترم چیه؟,1,5.0,amoozesh,شرایط حذف ترم چیه؟,5
1,از کجا می تونم با دکتر وحیدی ارتباط برقرار کنم؟,2,11.0,information,از کجا می‌تونم با دکتر وحیدی ارتباط برقرار کنم؟,10
2,بوفه برداران تا ساعت چند باز است؟,2,8.0,information,بوفه برداران تا ساعت چند باز است؟,8
3,کمترین تعداد واحد چند عدد است؟,1,7.0,amoozesh,کمترین تعداد واحد چند عدد است؟,7
4,سرورای دانشکده مشکل دارن؟,3,5.0,site/lib,سرورای دانشکده مشکل دارن؟,5


In [None]:
data = data[['cleaned_comment', 'label']]
data.columns = ['comment', 'label']
data.head()

Unnamed: 0,comment,label
0,شرایط حذف ترم چیه؟,amoozesh
1,از کجا می‌تونم با دکتر وحیدی ارتباط برقرار کنم؟,information
2,بوفه برداران تا ساعت چند باز است؟,information
3,کمترین تعداد واحد چند عدد است؟,amoozesh
4,سرورای دانشکده مشکل دارن؟,site/lib


In [None]:
print(f'We have #{len(labels)} labels: {labels}')

We have #5 labels: ['amoozesh', 'information', 'opinions', 'others', 'site/lib']


### Data Augmentation

#### Synonym Replacement

In [None]:
# word substitution driven by the contextual-embeddings model
aug = naw.ContextualWordEmbsAug(
    model_path='HooshvareLab/bert-fa-base-uncased',
    action="substitute",
)
data['synonym_replacement'] = data['comment'].apply(aug.augment)

#### Random Insertion

In [None]:
# word insertion driven by the contextual-embeddings model
aug = naw.ContextualWordEmbsAug(
    model_path='HooshvareLab/bert-fa-base-uncased',
    action="insert",
)
data['random_insertion'] = data['comment'].apply(aug.augment)

#### Random Swap

In [None]:
# random word swap inside each comment
aug = naw.RandomWordAug(action='swap')
data['random_swap'] = data['comment'].apply(aug.augment)

#### Random Deletion

In [None]:
# random word deletion inside each comment
aug = naw.RandomWordAug(action='delete')
data['random_deletion'] = data['comment'].apply(aug.augment)

### Extract the Result

In [None]:
data.to_csv("final_train_preprocessed.csv")

### Handling Unbalanced Data

In [None]:
data = data[['comment', 'label']]

In [None]:
# number of comments per label before balancing
groupby_label = data.groupby('label')['label'].count()
label_counts = groupby_label.tolist()

fig = go.Figure(data=[
    go.Bar(
        x=sorted(groupby_label.index),
        y=label_counts,
        text=label_counts,
        textposition='auto',
    )
])

bar_layout = dict(
    title_text='Distribution of label within comments [DATA]',
    xaxis_title_text='Label',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)
fig.update_layout(**bar_layout)

fig.show()

In [None]:
# Down-sample every class to the size of the smallest one.
# A fixed seed makes the balanced sample (and everything trained on it)
# reproducible across runs.
BALANCE_SEED = 0

class_frames = [data[data['label'] == label] for label in labels]
cutting_point = min(len(frame) for frame in class_frames)

# cutting_point is the minimum class size, so every class can supply that many
# rows; no per-class size check is needed.
balanced_frames = [
    frame.sample(n=cutting_point, random_state=BALANCE_SEED).reset_index(drop=True)
    for frame in class_frames
]

# per-class names kept for any cell that still refers to them
one_data, two_data, three_data, four_data, five_data = balanced_frames[:5]

new_data = pd.concat(balanced_frames)
new_data = new_data.sample(frac=1, random_state=BALANCE_SEED).reset_index(drop=True)
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2090 entries, 0 to 2089
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  2090 non-null   object
 1   label    2090 non-null   object
dtypes: object(2)
memory usage: 32.8+ KB


In [None]:
# number of comments per label after balancing
groupby_label = new_data.groupby('label')['label'].count()
label_counts = groupby_label.tolist()

fig = go.Figure(data=[
    go.Bar(
        x=sorted(groupby_label.index),
        y=label_counts,
        text=label_counts,
        textposition='auto',
    )
])

bar_layout = dict(
    title_text='Distribution of label within comments [NEW DATA]',
    xaxis_title_text='Label',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)
fig.update_layout(**bar_layout)

fig.show()

In [None]:
new_data.head()

Unnamed: 0,comment,label
0,آیا دانشگاه در ترم بعد باز می‌شود؟,others
1,تعداد و تنوع درس‌های اختیاری کم می‌باشد.,opinions
2,چگونه می‌توانم حذف ترم کنم؟,amoozesh
3,شرایط ارشد بدون کنکور چه است؟,amoozesh
4,بهتر است به اتاق بازی ایر هاکی اضافه کنید.,opinions


## Naive Bayes

In [None]:
new_data['label_id'] = new_data['label'].apply(lambda t: labels.index(t))

In [None]:
corpus = new_data['comment'].values.tolist()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1000)
 
X = cv.fit_transform(corpus).toarray()
# y = dataset.iloc[:, 1].values
# y = new_data['label_id'].values.tolist()
y = new_data['label'].values.tolist()

In [None]:
from sklearn. model_selection import train_test_split
 
X_train, X_test, y_train, y_test = train_test_split(
           X, y, test_size = 0.25, random_state = 0)

#### Basic

In [None]:
class BasicSoha(object):
    """Small Naive Bayes text classifier with Laplace (add-one) smoothing.

    The classes are taken from the labels passed to ``fit``. During prediction
    every distinct known token of a document contributes its smoothed
    log-likelihood once per class; tokens never seen in training are ignored.
    Class priors are not applied (they were disabled in the original
    implementation as well).
    """

    def get_word_counts(self, words):
        """Count how often each token occurs.

        Args:
            words: An iterable of tokens, or a plain string. A string is split
                on whitespace first; iterating it directly would count single
                characters instead of words.

        Returns:
            dict mapping token -> number of occurrences (as float).
        """
        if isinstance(words, str):
            words = words.split()
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts

    def fit(self, X, Y):
        """Collect per-class token statistics.

        Args:
            X: Iterable of documents (strings or token iterables).
            Y: Iterable of class labels aligned with ``X``.
        """
        # Classes come from the training labels, so the model does not depend
        # on a module-level label list. Sorted to give a stable tie-break order.
        self.classes = sorted(set(Y))
        self.num_messages = {}
        self.log_class_priors = {}  # kept for compatibility; priors are not used
        self.word_counts = {}
        self.total_words = {}
        self.vocab = set()
        for c in self.classes:
            self.num_messages[c] = sum(1 for label in Y if label == c)
            self.word_counts[c] = {}
            self.total_words[c] = 0.0
        for x, y in zip(X, Y):
            for word, count in self.get_word_counts(x).items():
                self.vocab.add(word)
                self.word_counts[y][word] = self.word_counts[y].get(word, 0.0) + count
                self.total_words[y] += count

    def predict(self, X):
        """Return the highest-scoring class for every document in ``X``.

        Args:
            X: Iterable of documents (strings or token iterables).

        Returns:
            list with one predicted class label per document.
        """
        vocab_size = len(self.vocab)
        result = []
        for x in X:
            counts = self.get_word_counts(x)
            scores = {c: 0.0 for c in self.classes}
            for word in counts:
                if word not in self.vocab:
                    continue
                for c in self.classes:
                    # Laplace smoothing: the denominator is the number of
                    # tokens seen in the class plus the vocabulary size (not
                    # the number of messages of the class).
                    scores[c] += math.log(
                        (self.word_counts[c].get(word, 0.0) + 1)
                        / (self.total_words[c] + vocab_size)
                    )
            result.append(max(scores, key=scores.get))
        return result

In [None]:
import math 
import string

# BasicSoha counts tokens of raw text, so it is trained on the comments
# themselves rather than on the CountVectorizer matrix in X_train / X_test.
# The split settings match the ones used for the matrix above.
text_train, text_test, label_train, label_test = train_test_split(
    corpus, y, test_size=0.25, random_state=0)

MNB = BasicSoha()
MNB.fit(text_train, label_train)
pred = MNB.predict(text_test)
true = label_test
accuracy = sum(1 for i in range(len(pred)) if pred[i] == true[i]) / float(len(pred))
accuracy

##### Cross Validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, f1_score

def cross_validation(model, n_splits, X, y):
    """Run K-fold cross-validation and print the averaged metrics.

    The same ``model`` instance is re-fitted on every fold. Precision, recall
    and F1 are computed with ``average='weighted'``.

    Args:
        model: Object with ``fit(X, y)`` and ``predict(X)`` methods.
        n_splits: Number of folds passed to ``KFold``.
        X: Samples (anything ``np.asarray`` can index by position).
        y: Targets aligned with ``X``.

    Returns:
        None. The four averaged scores are printed.
    """
    # Convert once instead of once per fold.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=n_splits)
    precisions, accuracies, recalls, f1s = [], [], [], []
    for train_index, test_index in kf.split(X):
        model.fit(X[train_index], y[train_index])
        pred = np.asarray(model.predict(X[test_index]))
        true = y[test_index]
        accuracies.append(float(np.mean(pred == true)))
        # zero_division=0 keeps the value scikit-learn already substitutes for
        # classes that are never predicted, without emitting a warning per fold.
        precisions.append(precision_score(true, pred, average='weighted', zero_division=0))
        recalls.append(recall_score(true, pred, average='weighted', zero_division=0))
        f1s.append(f1_score(true, pred, average='weighted', zero_division=0))
    print(f"average precision= {sum(precisions) / len(precisions)}")
    print(f"average recall= {sum(recalls) / len(recalls)}")
    print(f"average accuracy= {sum(accuracies) / len(accuracies)}")
    print(f"average f1-score= {sum(f1s) / len(f1s)}")

In [None]:
X = new_data['comment'].values.tolist()
y = new_data['label'].values.tolist()
cross_validation(BasicSoha(), 3, X, y)

average precision= 0.363566258201719
average recall= 0.37893105097379576
average accuracy= 0.37893105097379576
average f1-score= 0.2894836725371097



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



#### Multinomial Naive Bayes

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1000)
 
X = cv.fit_transform(corpus).toarray()
# y = dataset.iloc[:, 1].values
y = new_data['label_id'].values.tolist()
# y = new_data['label'].values.tolist()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
           X, y, test_size = 0.25, random_state = 0)

In [None]:
from sklearn.naive_bayes import MultinomialNB
 
classifier_multi = MultinomialNB();
classifier_multi.fit(X_train, y_train)
y_pred = classifier_multi.predict(X_test)

In [None]:
    # BernoulliNB : Naive Bayes classifier for multivariate Bernoulli models.
    # CategoricalNB : Naive Bayes classifier for categorical features.
    # ComplementNB : Complement Naive Bayes classifier.
    # GaussianNB : Gaussian Naive Bayes.

In [None]:
cross_validation(classifier_multi, 3, X, y)

average precision= 0.756290886283873
average recall= 0.7564555539064518
average accuracy= 0.7564555539064518
average f1-score= 0.7528721247459472


#### Gaussian Naive Bayes

In [None]:
# fitting naive bayes to the training set
from sklearn.naive_bayes import GaussianNB
 
classifier = GaussianNB();
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
cross_validation(classifier, 3, X, y)

average precision= 0.6587369084243778
average recall= 0.6397127536184084
average accuracy= 0.6397127536184084
average f1-score= 0.6347225652246411


#### Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
 
bern = BernoulliNB();
bern.fit(X_train, y_train)
y_pred = bern.predict(X_test)

In [None]:
cross_validation(bern, 3, X, y)

average precision= 0.7634056440810616
average recall= 0.7545419064738316
average accuracy= 0.7545419064738316
average f1-score= 0.7564911927897234


#### Complement Naive Bayes

In [None]:
from sklearn.naive_bayes import ComplementNB
 
comp = ComplementNB();
comp.fit(X_train, y_train)
y_pred = comp.predict(X_test)

In [None]:
cross_validation(comp, 3, X, y)

average precision= 0.7470395026132929
average recall= 0.7435430718404547
average accuracy= 0.7435430718404547
average f1-score= 0.7355544481249283


## Train, Validation, Test split (0.1)

In [None]:
new_data['label_id'] = new_data['label'].apply(lambda t: labels.index(t))

train, test = train_test_split(new_data, test_size=0.1, random_state=1, stratify=new_data['label'])
train, valid = train_test_split(train, test_size=0.1, random_state=1, stratify=train['label'])

train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

x_train, y_train = train['comment'].values.tolist(), train['label_id'].values.tolist()
x_valid, y_valid = valid['comment'].values.tolist(), valid['label_id'].values.tolist()
x_test, y_test = test['comment'].values.tolist(), test['label_id'].values.tolist()

print(train.shape)
print(valid.shape)
print(test.shape)

(1692, 3)
(189, 3)
(209, 3)


## TensorFlow

In [None]:
from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf

### Configuration

In [None]:
# general config
# intended maximum length (in tokens) of a tokenized comment
MAX_LEN = 128
# batch sizes for the training / validation / test pipelines
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

# number of passes given to model.fit
EPOCHS = 3
# NOTE(review): EEVERY_EPOCH and CLIP are not referenced in the visible part of
# the notebook; the first name looks like a typo of "EVERY_EPOCH" -- confirm
# before renaming or removing.
EEVERY_EPOCH = 1000
# learning rate handed to build_model
LEARNING_RATE = 2e-5
CLIP = 0.0

# pretrained checkpoint used for the tokenizer, the config and the model
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
# only the directory part of this path is used below (created here, and later
# passed to save_pretrained)
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin'

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
label2id = {label: i for i, label in enumerate(labels)}
id2label = {v: k for k, v in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'amoozesh': 0, 'information': 1, 'opinions': 2, 'others': 3, 'site/lib': 4}
id2label: {0: 'amoozesh', 1: 'information', 2: 'opinions', 3: 'others', 4: 'site/lib'}


In [None]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })

print(config.to_json_string())

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "amoozesh",
    "1": "information",
    "2": "opinions",
    "3": "others",
    "4": "site/lib"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "amoozesh": 0,
    "information": 1,
    "opinions": 2,
    "others": 3,
    "site/lib": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



### Input Embeddings / Dataset

In [None]:
class InputExample:
    """One sequence-classification example: an id, one or two texts, a label."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the example id, the text(s) and the optional label as attributes."""
        self.guid, self.text_a, self.text_b, self.label = guid, text_a, text_b, label


def make_examples(tokenizer, x, y=None, maxlen=128, output_mode="classification", is_tf_dataset=True, label_list=None):
    """Tokenize texts (and optional labels) into model-ready features.

    Args:
        tokenizer: Tokenizer forwarded to ``glue_convert_examples_to_features``.
        x: Sequence of inputs; each item is a string or a pair of two texts.
        y: Optional list / ``np.ndarray`` of integer-like labels aligned with
            ``x``. Any other value (e.g. ``None``) means "no labels".
        maxlen: Maximum sequence length forwarded to the feature converter.
        output_mode: Forwarded to the feature converter.
        is_tf_dataset: When true return a ``tf.data.Dataset``; otherwise
            return plain arrays.
        label_list: Optional explicit list of all label values, forwarded to
            the feature converter. Defaults to the sorted unique labels found
            in ``y``.
            NOTE(review): the converter is expected to encode each label by
            its position in this list, so pass the full label set when a split
            may lack a class -- confirm against the transformers documentation.

    Returns:
        ``(dataset, features)`` when ``is_tf_dataset`` is true; the dataset
        yields ``(inputs_dict, label)`` pairs, or only ``inputs_dict`` when no
        labels are available. Otherwise ``([xdata, ydata], features)`` with
        ``xdata = [input_ids, attention_masks, token_type_ids]`` as numpy
        arrays and ``ydata`` the list of labels.
    """
    y = y if isinstance(y, (list, np.ndarray)) else [None] * len(x)

    examples = []
    for i, (_x, _y) in tqdm(enumerate(zip(x, y)), position=0, total=len(x)):
        guid = "%s" % i
        # Unlabeled inputs keep label=None; int(None) would raise a TypeError.
        label = int(_y) if _y is not None else None

        if isinstance(_x, str):
            text_a = _x
            text_b = None
        else:
            assert len(_x) == 2
            text_a = _x[0]
            text_b = _x[1]

        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

    if label_list is None:
        # Sorted unique labels, ignoring missing ones (np.unique cannot sort None).
        label_list = sorted({example.label for example in examples if example.label is not None})

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        max_length=maxlen,
        output_mode=output_mode,
        label_list=label_list)

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    for f in tqdm(features, position=0, total=len(examples)):
        all_input_ids.append(f.input_ids)
        all_attention_masks.append(f.attention_mask)
        all_token_type_ids.append(f.token_type_ids)
        all_labels.append(f.label)

    if is_tf_dataset:
        inputs = {
            'input_ids': tf.constant(all_input_ids),
            'attention_mask': tf.constant(all_attention_masks),
            'token_type_ids': tf.constant(all_token_type_ids)
        }

        if all(label is not None for label in all_labels):
            dataset = tf.data.Dataset.from_tensor_slices((inputs, tf.constant(all_labels)))
        else:
            # No labels available: build an inputs-only dataset.
            dataset = tf.data.Dataset.from_tensor_slices(inputs)

        return dataset, features

    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features

In [None]:
# MAX_LEN comes from the configuration cell, so the sequence length is set in
# one place instead of being repeated as a literal in every call.
train_dataset_base, train_examples = make_examples(tokenizer, x_train, y_train, maxlen=MAX_LEN)
valid_dataset_base, valid_examples = make_examples(tokenizer, x_valid, y_valid, maxlen=MAX_LEN)

# the test split is prepared twice: once as a tf.data.Dataset and once as
# plain arrays (xtest / ytest)
test_dataset_base, test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN)
[xtest, ytest], test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN, is_tf_dataset=False)

  0%|          | 0/1692 [00:00<?, ?it/s]


This function will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_glue.py



  0%|          | 0/1692 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/209 [00:00<?, ?it/s]

  0%|          | 0/209 [00:00<?, ?it/s]

  0%|          | 0/209 [00:00<?, ?it/s]

  0%|          | 0/209 [00:00<?, ?it/s]

In [None]:
for value in train_dataset_base.take(1):
    print(f'     input_ids: {value[0]["input_ids"]}')
    print(f'attention_mask: {value[0]["attention_mask"]}')
    print(f'token_type_ids: {value[0]["token_type_ids"]}')
    print(f'        target: {value[1]}')

     input_ids: [   2 4597 5880 2802 5631 2786 6807 4908 2789 2867 3344 1350    4    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
attention_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
token_type_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [None]:
def get_training_dataset(dataset, batch_size, shuffle_buffer=2048):
    """Build the shuffled, endlessly repeating, batched training pipeline.

    Shuffling is applied before repeating, so examples of different passes
    over the data are not mixed inside one shuffle buffer.

    Args:
        dataset: Dataset object providing ``shuffle``, ``repeat`` and ``batch``.
        batch_size: Number of examples per batch.
        shuffle_buffer: Size of the shuffle buffer (default 2048).

    Returns:
        The transformed dataset.
    """
    dataset = dataset.shuffle(shuffle_buffer)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)

    return dataset

def get_validation_dataset(dataset, batch_size):
    """Return ``dataset`` grouped into batches of ``batch_size`` (no shuffling, no repeating)."""
    return dataset.batch(batch_size)

In [None]:
train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE)
# Validation uses the plain batched pipeline: without shuffling or repeating,
# every validation pass reads the same examples in the same order.
valid_dataset = get_validation_dataset(valid_dataset_base, VALID_BATCH_SIZE)

# number of full batches per pass (a trailing partial batch is not counted)
train_steps = len(train_examples) // TRAIN_BATCH_SIZE
valid_steps = len(valid_examples) // VALID_BATCH_SIZE

train_steps, valid_steps

(105, 11)

### Model

In [None]:
def build_model(model_name, config, learning_rate=3e-5):
    """Load the pretrained sequence classifier and compile it for training.

    The model is compiled with Adam at ``learning_rate``, sparse categorical
    cross-entropy on logits, and sparse categorical accuracy reported under
    the name 'accuracy'.
    """
    model = TFBertForSequenceClassification.from_pretrained(model_name, config=config)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )
    return model

In [None]:
model = build_model(MODEL_NAME_OR_PATH, config, learning_rate=LEARNING_RATE)

Downloading:   0%|          | 0.00/919M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training

In [None]:
%%time

r = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    verbose=1)

final_accuracy = r.history['val_accuracy']
print('FINAL ACCURACY MEAN: ', np.mean(final_accuracy))

### Save in Google Drive

In [None]:
# Mount Google Drive under /content/gdrive so the saved model can be copied there.
# NOTE(review): `_mount` is an underscore-prefixed (non-public) function of
# google.colab.drive; `drive.mount` is the documented entry point -- confirm
# whether the private variant is really required here.
from google.colab import drive
drive._mount('/content/gdrive')

In [None]:
# save the model 
model.save_pretrained(os.path.dirname(OUTPUT_PATH))

In [None]:
# Copy the trained weights to Google Drive.
# NOTE(review): no visible cell writes a file named tf_model_AI_tagger_7670.h5;
# the save cell above writes into the directory via save_pretrained. Confirm
# the file name (or copy the whole directory) so this works on a fresh run.
!cp -r /content/bert-fa-base-uncased-sentiment-taaghceh/tf_model_AI_tagger_7670.h5 /content/gdrive/MyDrive/

### Evaluation / Prediction

In [None]:
# Evaluate the fine-tuned model on the held-out test split.
# cross_validation() expects an estimator that can be re-fitted on raw arrays,
# and `x_digi` is not defined in the visible part of this notebook, so the
# previous call could not run for the BERT model.
predictions = model.predict(xtest)
# the first element of the model output holds the class logits
ypred = predictions[0].argmax(axis=-1).tolist()

print(classification_report(ytest, ypred, target_names=labels))
print(f'F1: {f1_score(ytest, ypred, average="weighted")}')

average precision= 0.7879295783014673
average recall= 0.7729640294617848
average accuracy= 0.7793748162094758
average f1-score= 0.7799127457493047


# Test

## Dataset


### Load the Data Using Pandas

In [None]:
data = pd.read_csv("/content/test.csv", names=['id', 'comment'], skiprows=1)

In [None]:
data.head()

Unnamed: 0,id,comment
0,0,چرا آخر ترم درس ها انقدر فشرده میشوند؟
1,1,فرجه این ترم چقدر است؟
2,2,صندلی های دانشگاه را ابری کنید!
3,3,محل تشکیل امتحان
4,4,دانشکده زیراکس دارد؟


In [None]:
len(data)

762

In [None]:
# Keep only the text column. The explicit copy makes `data` an independent
# frame, so the columns added below are not written into a slice of the
# loaded frame (which triggers pandas' SettingWithCopyWarning).
data = data[['comment']].copy()

In [None]:
data.head()

Unnamed: 0,comment
0,چرا آخر ترم درس ها انقدر فشرده میشوند؟
1,فرجه این ترم چقدر است؟
2,صندلی های دانشگاه را ابری کنید!
3,محل تشکیل امتحان
4,دانشکده زیراکس دارد؟


### Fixing Conflicts


In [None]:
# print data information
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
print('data information')
data.info()
print()

# print missing values information
print('missing values stats')
print(data.isnull().sum(), '\n')

# (the test frame is loaded without a 'rate' column, so the per-rate listing
# of missing values used for the training data is disabled here)
# # print some missing values
# print('some missing values')
# print(data[data['rate'].isnull()].iloc[:5], '\n')

data information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762 entries, 0 to 761
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  762 non-null    object
dtypes: object(1)
memory usage: 6.1+ KB
None 

missing values stats
comment    0
dtype: int64 



### Normalization / Preprocessing


In [None]:
# calculate the length of comments based on their words
data['comment_len_by_words'] = data['comment'].apply(lambda t: len(hazm.word_tokenize(t)))

In [None]:
min_max_len = data["comment_len_by_words"].min(), data["comment_len_by_words"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')

Min: 1 	Max: 19


In [None]:
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='comment_len_by_words'):
    """Print the share of rows whose length lies in ``(greater_than, less_than]``.

    Args:
        data: DataFrame that holds the length column.
        less_than: Upper bound of the length (inclusive).
        greater_than: Lower bound of the length (exclusive).
        col: Name of the column that stores the lengths.

    Returns:
        None. The percentage is printed with two decimals.
    """
    data_length = data[col].values
    total = len(data_length)

    if total:
        # Vectorised count of the lengths that fall inside the interval.
        in_range = (data_length > greater_than) & (data_length <= less_than)
        data_glt_rate = (int(in_range.sum()) / total) * 100
    else:
        # An empty frame has nothing to report; avoid dividing by zero.
        data_glt_rate = 0.0

    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')

In [None]:
data_gl_than(data, 25, 3)

Texts with word length of greater than 3 and less than 25 includes 95.28% of the whole!


In [None]:
# minlim, maxlim = 3, 25

In [None]:
# # remove comments with the length of fewer than three words
# data['comment_len_by_words'] = data['comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
# # data = data.dropna(subset=['comment_len_by_words'])
# data = data.reset_index(drop=True)

In [None]:
# histogram of the number of words per comment
fig = go.Figure(data=[go.Histogram(x=data['comment_len_by_words'])])

hist_layout = dict(
    title_text='Distribution of word counts within comments',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)
fig.update_layout(**hist_layout)

fig.show()

In [None]:
def cleanhtml(raw_html):
    """Return `raw_html` with every non-greedy `<...>` span removed."""
    return re.sub('<.*?>', '', raw_html)


def cleaning(text):
    """Clean and normalize one raw comment.

    Steps, in order:
      1. strip surrounding whitespace;
      2. `cleantext.clean`: lower-case, remove line breaks, and replace URLs,
         e-mails, phone numbers and currency symbols with empty strings
         (numbers, digits and punctuation are kept);
      3. drop `<...>` tags via `cleanhtml`;
      4. normalize with `hazm.Normalizer`;
      5. remove emoji, pictographs and directional/joiner marks
         (the zero-width non-joiner U+200C is deliberately kept);
      6. delete `#` and collapse whitespace runs to a single space.

    Args:
        text (str): raw comment.

    Returns:
        str: the cleaned comment.
    """
    text = text.strip()

    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    # This function is applied once per row, so build the hazm normalizer on
    # the first call only and reuse it afterwards instead of constructing a
    # new one for every comment.
    normalizer = getattr(cleaning, '_normalizer', None)
    if normalizer is None:
        normalizer = cleaning._normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns
    weird_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        # u"\u200c"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)

    text = weird_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    # raw string: "\s" in a plain literal is an invalid escape sequence
    text = re.sub(r"\s+", " ", text)
    # text = re.sub(u"\u200c", " ", text)

    return text

In [None]:
# Row count before cleaning, for comparison with the count after cleaning.
len(data)

762

In [None]:
# cleaning comments
data['cleaned_comment'] = data['comment'].apply(cleaning)


# calculate the length of the cleaned comments based on their words
data['cleaned_comment_len_by_words'] = data['cleaned_comment'].apply(lambda t: len(hazm.word_tokenize(t)))

# No length-based filtering is applied here. The statement that used to sit at
# this point returned the length unchanged on both branches (a no-op) and
# referenced minlim/maxlim, which are commented out in this section, so it could
# only fail with a NameError or do nothing. Row count is therefore unchanged.
data = data.reset_index(drop=True)

data.head()

Unnamed: 0,comment,comment_len_by_words,cleaned_comment,cleaned_comment_len_by_words
0,چرا آخر ترم درس ها انقدر فشرده میشوند؟,9,چرا آخر ترم درس‌ها انقدر فشرده میشوند؟,8
1,فرجه این ترم چقدر است؟,6,فرجه این ترم چقدر است؟,6
2,صندلی های دانشگاه را ابری کنید!,7,صندلی‌های دانشگاه را ابری کنید!,6
3,محل تشکیل امتحان,3,محل تشکیل امتحان,3
4,دانشکده زیراکس دارد؟,4,دانشکده زیراکس دارد؟,4


In [None]:
# Row count after cleaning; no rows are dropped by the cleaning cell.
len(data)

762

In [None]:
# keep only the cleaned text and expose it under the original column name
data = data[['cleaned_comment']].rename(columns={'cleaned_comment': 'comment'})
data.head()

Unnamed: 0,comment
0,چرا آخر ترم درس‌ها انقدر فشرده میشوند؟
1,فرجه این ترم چقدر است؟
2,صندلی‌های دانشگاه را ابری کنید!
3,محل تشکیل امتحان
4,دانشکده زیراکس دارد؟


In [None]:
# Class names; the position in this list becomes the integer id in label2id / id2label.
# NOTE(review): the `rate` column shown for train.csv looks 1-based while these ids
# start at 0 — confirm how rates were mapped to ids before interpreting predictions.
labels = ['amoozesh', 'information', 'site/lib', 'opinions', 'others']

In [None]:
# Show how many classes there are and their names.
print(f'We have #{len(labels)} labels: {labels}')

We have #5 labels: ['amoozesh', 'information', 'site/lib', 'opinions', 'others']


In [None]:
# cleaned test comments as a plain Python list of strings
x_digi = data['comment'].tolist()

## TensorFlow

In [None]:
from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf

### Configuration

In [None]:
# general config
# NOTE(review): the make_examples calls in the prediction cells pass maxlen=128
# literally instead of MAX_LEN — keep the two in sync if this value changes.
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 3
# NOTE(review): the name looks like a typo for EVERY_EPOCH; left unchanged because
# other cells may reference it.
EEVERY_EPOCH = 1000
# Passed to build_model(..., learning_rate=LEARNING_RATE) when the model is loaded.
LEARNING_RATE = 2e-5
CLIP = 0.0

# Pretrained checkpoint used for both the tokenizer and the BertConfig.
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
# NOTE(review): the path mentions "sentiment-taaghceh" and a PyTorch .bin file,
# which looks inherited from another notebook — confirm it is still wanted.
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin'

# Create the parent directory of OUTPUT_PATH if it does not exist yet.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# name -> id follows the order of `labels`; id -> name is the inverse mapping
label2id = dict(zip(labels, range(len(labels))))
id2label = {idx: name for name, idx in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'amoozesh': 0, 'information': 1, 'site/lib': 2, 'opinions': 3, 'others': 4}
id2label: {0: 'amoozesh', 1: 'information', 2: 'site/lib', 3: 'opinions', 4: 'others'}


In [None]:
# tokenizer and model config come from the same checkpoint; the label maps are
# attached to the config so they are stored alongside it
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(MODEL_NAME_OR_PATH, label2id=label2id, id2label=id2label)

print(config.to_json_string())

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440 [00:00<?, ?B/s]

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "amoozesh",
    "1": "information",
    "2": "site/lib",
    "3": "opinions",
    "4": "others"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "amoozesh": 0,
    "information": 1,
    "opinions": 3,
    "others": 4,
    "site/lib": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



### Input Embeddings / Dataset

In [None]:
class InputExample:
    """Container for one sequence-classification example.

    Attributes are plain copies of the constructor arguments:
    `guid`, `text_a`, `text_b` (optional second text) and `label` (optional).
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the identifier, the text(s) and the label on the instance."""
        self.guid, self.text_a, self.text_b, self.label = guid, text_a, text_b, label


def make_examples(tokenizer, x, y=None, maxlen=128, output_mode="classification", is_tf_dataset=True):
    """Tokenize texts into BERT features and package them for TensorFlow or NumPy.

    Args:
        tokenizer: tokenizer handed to `glue_convert_examples_to_features`.
        x: iterable of inputs; each item is a string or a pair of two texts.
        y: list / ndarray of labels. Any other value (e.g. None) is replaced by
            an all-zero dummy label list of the same length as `x`.
        maxlen: maximum sequence length passed to the feature converter.
        output_mode: forwarded to the feature converter.
        is_tf_dataset: choose the return format (see Returns).

    Returns:
        `(dataset, features)` when `is_tf_dataset` is true, where `dataset` is a
        `tf.data.Dataset` of `({input_ids, attention_mask, token_type_ids}, label)`;
        otherwise `([xdata, ydata], features)`, where `xdata` is the list
        `[input_ids, attention_masks, token_type_ids]` of arrays and `ydata`
        is the list of labels.
    """
    if not isinstance(y, (list, np.ndarray)):
        y = [0] * len(x)

    examples = []
    for idx, (sample, target) in tqdm(enumerate(zip(x, y)), position=0, total=len(x)):
        guid = "%s" % idx
        label = int(target)

        if isinstance(sample, str):
            first_text, second_text = sample, None
        else:
            assert len(sample) == 2
            first_text, second_text = sample[0], sample[1]

        examples.append(InputExample(guid=guid, text_a=first_text, text_b=second_text, label=label))

    # NOTE(review): the label list is derived from the values present in `y`,
    # so label indices depend on which classes appear in this particular call.
    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        maxlen,
        output_mode=output_mode,
        label_list=list(np.unique(y)))

    # wrap every field in a tf.constant for the dataset path, keep it raw otherwise
    wrap = tf.constant if is_tf_dataset else (lambda field: field)

    all_input_ids, all_attention_masks, all_token_type_ids, all_labels = [], [], [], []
    for feature in tqdm(features, position=0, total=len(examples)):
        all_input_ids.append(wrap(feature.input_ids))
        all_attention_masks.append(wrap(feature.attention_mask))
        all_token_type_ids.append(wrap(feature.token_type_ids))
        all_labels.append(wrap(feature.label))

    if not is_tf_dataset:
        xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
        ydata = all_labels
        return [xdata, ydata], features

    model_inputs = {
        'input_ids': all_input_ids,
        'attention_mask': all_attention_masks,
        'token_type_ids': all_token_type_ids
    }
    dataset = tf.data.Dataset.from_tensor_slices((model_inputs, all_labels))

    return dataset, features

In [None]:
# Build the test inputs twice: once as a tf.data.Dataset (used for inspection below)
# and once as NumPy arrays (xtest) for model.predict. No labels are passed, so
# make_examples fills in dummy zero labels. The second call overwrites test_examples.
test_dataset_base, test_examples = make_examples(tokenizer, x_digi, maxlen=128)
[xtest, ytest], test_examples = make_examples(tokenizer, x_digi, maxlen=128, is_tf_dataset=False)

  0%|          | 0/762 [00:00<?, ?it/s]


This function will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_glue.py



  0%|          | 0/762 [00:00<?, ?it/s]

  0%|          | 0/762 [00:00<?, ?it/s]

  0%|          | 0/762 [00:00<?, ?it/s]

In [None]:
# peek at the first encoded example: model inputs plus its (dummy) target
for model_inputs, target in test_dataset_base.take(1):
    print(f'     input_ids: {model_inputs["input_ids"]}')
    print(f'attention_mask: {model_inputs["attention_mask"]}')
    print(f'token_type_ids: {model_inputs["token_type_ids"]}')
    print(f'        target: {target}')

     input_ids: [    2  3660  5109 12472 33401  6853  9040  3344  1350     4     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
attention_mask: [1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [None]:
def get_training_dataset(dataset, batch_size):
    """Return `dataset` repeated, shuffled with a 2048-element buffer, then batched."""
    return dataset.repeat().shuffle(2048).batch(batch_size)

def get_validation_dataset(dataset, batch_size):
    """Return `dataset` batched by `batch_size`, with no repeat and no shuffle."""
    return dataset.batch(batch_size)

## Naive Bayes Prediction

In [None]:
# NOTE(review): `cv` is not defined in this section — presumably the CountVectorizer
# fitted in the training part of the notebook; confirm it exists before running.
from sklearn.feature_extraction.text import CountVectorizer
# Vectorize the cleaned test comments with the existing vocabulary and convert to an array.
X = cv.transform(x_digi).toarray()

In [None]:
# NOTE(review): `classifier_multi` is not defined in this section — presumably the
# Naive Bayes model trained earlier in the notebook; confirm before running.
y_pred = classifier_multi.predict(X)

In [None]:
# Sanity check: one prediction per test comment is expected.
len(y_pred)

762

In [None]:
# Wrap the Naive Bayes predictions in a DataFrame so they can be written to CSV.
df_pred = pd.DataFrame(y_pred)

In [None]:
# Persist the Naive Bayes predictions (to_csv is called with its default options).
df_pred.to_csv('/content/y_pred_Naive_AI4.csv')

## Load Pretrained

In [None]:
# Colab only: mount Google Drive; its contents become available under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Re-creates the tokenizer and config with the same arguments as the earlier
# Configuration cell, so this section can be run after only the config/label cells.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })

print(config.to_json_string())

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "amoozesh",
    "1": "information",
    "2": "site/lib",
    "3": "opinions",
    "4": "others"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "amoozesh": 0,
    "information": 1,
    "opinions": 3,
    "others": 4,
    "site/lib": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



In [None]:
def build_model(model_name, config, learning_rate=3e-5):
    """Load a TF BERT sequence classifier and compile it.

    Args:
        model_name: checkpoint name or path given to `from_pretrained`.
        config: model configuration passed to `from_pretrained`.
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        The model compiled with Adam, sparse categorical cross-entropy on
        logits, and a sparse categorical accuracy metric named 'accuracy'.
    """
    classifier = TFBertForSequenceClassification.from_pretrained(model_name, config=config)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )

    return classifier

In [None]:
# Load the fine-tuned weights from Google Drive. The path must live under the mount
# point used by drive.mount('/content/drive'); the former '/content/gdrive/...' prefix
# does not match that mount point.
model = build_model('/content/drive/MyDrive/tf_model_AI_tagger_7670.h5', config, learning_rate=LEARNING_RATE)

### Evaluation / Prediction

In [None]:
# xtest is the list [input_ids, attention_masks, token_type_ids] returned by
# make_examples(..., is_tf_dataset=False), so this is the number of input arrays,
# not the number of test comments.
len(xtest)

In [None]:
# run the classifier on the test inputs and take the highest-scoring class per row
# (element 0 of the prediction output is presumably the logits — confirm)
predictions_digi = model.predict(xtest)
ypred_digi = np.argmax(predictions_digi[0], axis=-1).tolist()

### Save as CSV

In [None]:
# Wrap the BERT class-id predictions in a DataFrame so they can be written to CSV.
df_digi = pd.DataFrame(ypred_digi)

In [None]:
# Persist the BERT predictions (to_csv is called with its default options).
df_digi.to_csv('/content/y_pred_AI.csv')