In [None]:
import pandas as pd

# Load the train and test CSV files; the paths are relative to the notebook's working directory.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

In [None]:
# Print a summary of the training DataFrame via DataFrame.info()
train_data.info()

In [None]:
# Print a summary of the test DataFrame via DataFrame.info()
test_data.info()

In [None]:
# Count how many rows fall into each class of the target column (before label encoding)
train_data['recommendation_status'].value_counts()

In [None]:
# Encode the target labels as integers:
#   "not_recommended" -> 0, "recommended" -> 1, "no_idea" -> 2.
# Values that are not one of these label strings (for example the integer codes produced by an
# earlier run of this cell) are passed through unchanged, so re-running the cell is harmless
# instead of turning the whole column into NaN.
label_to_id = {"no_idea": 2, "recommended": 1, "not_recommended": 0}
train_data["recommendation_status"] = train_data["recommendation_status"].map(
    lambda status: label_to_id.get(status, status)
)

In [None]:
# Check which distinct values are now stored in "recommendation_status"
train_data["recommendation_status"].unique()

In [None]:
# Class distribution of the target column after label encoding
train_data["recommendation_status"].value_counts()

In [None]:
!pip install hazm

In [None]:
# requirements: pip install hazm
import re
from string import punctuation as ascii_punct
from hazm import Normalizer, Stemmer, WordTokenizer

# Create the hazm helpers once at module level; preprocess_text reuses these instances on every call
normalizer = Normalizer()
stemmer = Stemmer()
tokenizer = WordTokenizer()

# Default Persian stopword set used by preprocess_text when no custom set is passed
# (a small sample; a more complete stopword file is recommended)
DEFAULT_PERSIAN_STOPWORDS = {
    "و","در","به","از","که","این","را","با","برای","است","می","شد","اگر","ها","آن","تا","یا","هم"
}

# String of common Persian punctuation characters (a str, not a set); preprocess_text
# concatenates it with the ASCII punctuation to build a regex character class
PERSIAN_PUNCT = "،؛؟«»—–-•…٬"  # can be extended with further characters

def preprocess_text(text: str, stopwords: set | None = None) -> list:
    """
    Turn a raw Persian comment into a list of stemmed tokens.

    Steps, in order:
    1. Normalize with the module-level hazm ``normalizer``.
    2. Replace runs of Latin (0-9) and Persian (۰-۹) digits with a space.
    3. Replace ASCII and Persian punctuation characters with a space.
    4. Delete zero-width non-joiner / joiner characters (U+200C, U+200D).
    5. Collapse whitespace runs into single spaces and strip both ends.
    6. Tokenize with the module-level hazm ``tokenizer``.
    7. Drop stopwords and empty / whitespace-only tokens.
    8. Stem every remaining token with the module-level hazm ``stemmer`` and
       drop any stem that is an empty string.

    Args:
        text: The raw comment.
        stopwords: Tokens to discard before stemming. When None,
            DEFAULT_PERSIAN_STOPWORDS is used.

    Returns:
        List of non-empty stemmed tokens; may be an empty list.

    Raises:
        TypeError: If ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string")

    if stopwords is None:
        stopwords = DEFAULT_PERSIAN_STOPWORDS

    # 1. Normalization
    text = normalizer.normalize(text)

    # 2. Remove numbers: 0-9 (Latin) and ۰-۹ (Persian) become a space so neighbours stay separated
    text = re.sub(r'[0-9۰-۹]+', ' ', text)

    # 3. Remove punctuation (Latin + Persian + a few extra quote/bullet signs).
    #    re.escape makes every character safe to place inside the [...] class.
    all_punct = ascii_punct + PERSIAN_PUNCT + "`\"'«»“”‚„‹›·•"
    pattern_punct = f"[{re.escape(all_punct)}]"
    text = re.sub(pattern_punct, ' ', text)

    # 4. Delete zero-width non-joiner / joiner characters (replaced by nothing, not by a space,
    #    so the parts on both sides are glued together)
    text = re.sub(r'[\u200c\u200d]', '', text)

    # 5. Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 6. Tokenization
    tokens = tokenizer.tokenize(text)

    # 7. Remove stopwords, empty tokens, and tokens that are only whitespace (safety net)
    filtered = [t for t in tokens if t and t not in stopwords and not re.fullmatch(r'\s+', t)]

    # 8. Stem each token separately.
    #    NOTE: a suffix-stripping stemmer may reduce a token that consists only of a suffix
    #    to '', so empty stems are filtered out instead of being returned as '' tokens.
    stemmed = (stemmer.stem(t) for t in filtered)
    return [stem for stem in stemmed if stem]


In [None]:
# Raw comment text column of the training set (input of the preprocessing step)
dataes = train_data['body']

In [None]:
# Run preprocess_text on every comment; the result holds one list of stemmed tokens per row.
# NOTE: preprocess_text raises TypeError for any cell that is not a str (e.g. a missing value).
data_processed = dataes.apply(preprocess_text)

In [None]:
# Store the token lists in a new "preprocess" column and preview the first rows
train_data["preprocess"] = data_processed
train_data.head()

In [None]:
from gensim.models import Word2Vec

# One list of tokens per comment, taken from the preprocessed column.
sentences = train_data['preprocess'].to_list()

# Passing `sentences` to the constructor builds the vocabulary AND runs all `epochs` training
# passes, so no separate model.train(...) call follows: an extra call would train for a further
# 30 epochs on top of the 30 requested here.
model = Word2Vec(sentences=sentences,
                 vector_size=200,           # embedding dimensionality
                 window=7,                  # context window
                 min_count=2,               # ignore words that occur fewer than 2 times
                 sg=1,                      # use skip-gram
                 negative=10,               # negative sampling
                 epochs=30,                 # number of training passes over `sentences`
                 workers=8,                 # worker threads; NOTE: with more than one worker the
                                            # run is not exactly reproducible even with a fixed seed
                 seed=42,
                 )

In [None]:
# Sanity check of the embeddings: list the nearest neighbours of the word "دوست".
# NOTE(review): presumably fails with KeyError if the word did not make it into the vocabulary — confirm.
model.wv.most_similar("دوست")

In [None]:
import numpy as np

In [None]:
# Create sentence vectors by averaging word vectors
def sentence_vector(sentence):
    """
    Embed a tokenised sentence as the mean of its word vectors.

    Tokens that are missing from the vocabulary of the global Word2Vec ``model``
    contribute an all-zero vector, so they still count towards the mean.
    A sentence without any tokens yields an all-zero vector.

    Args:
        sentence: Iterable of token strings.

    Returns:
        1-D numpy array whose length is the embedding size of ``model``.
    """
    # Read the dimensionality from the trained model instead of hard-coding it, so the
    # function keeps working when vector_size is changed in the training cell.
    dim = model.wv.vector_size

    vectors = []
    for word in sentence:
        try:
            vectors.append(model.wv[word])
        except KeyError:
            # Out-of-vocabulary word: represent it with a zero vector
            vectors.append(np.zeros(dim))

    if not vectors:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)


In [None]:
# Turn every token list into one averaged embedding vector, then display the resulting Series
sentence_vectors = train_data['preprocess'].apply(sentence_vector)
sentence_vectors

In [None]:
from sklearn.model_selection import train_test_split

# Stack the per-comment sentence vectors into a 2-D feature matrix (one row per comment)
X = np.array(sentence_vectors.to_list())

# Target labels from the training frame (encoded earlier in the notebook)
y = train_data["recommendation_status"].values

# Hold out 20% of the rows for evaluation. stratify=y keeps the class proportions the same
# in both parts, so a rare class cannot end up under-represented in either split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:

from sklearn.linear_model import LogisticRegression

# Initialize the manually configured Logistic Regression model and fit it on the training split.
# NOTE(review): the target is encoded with three codes (0/1/2) earlier in the notebook; liblinear
# handles more than two classes one-vs-rest, and recent scikit-learn releases reportedly deprecate
# liblinear for multiclass targets — confirm against the installed version.
logistic_model = LogisticRegression(max_iter=1000,
                solver='liblinear',   # robust for small/medium datasets
                C=2.0,                # regularization strength (bigger = less regularization)
                class_weight='balanced',  # reweight classes in case they are imbalanced
                random_state=42
            )
logistic_model.fit(X_train, y_train)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Hyper-parameter grid: every combination of the values below is evaluated
# (5 C values x 2 solvers x 2 penalties x 2 class_weight settings = 40 candidates)
param_grid = {
    'C': [0.5, 1, 2, 5, 10],              # regularization strength
    'solver': ['liblinear', 'saga'],      # different optimization algorithms
    'penalty': ['l1', 'l2'],              # type of regularization
    'class_weight': [None, 'balanced']    # handle class imbalance
}

# Estimator whose remaining parameters are filled in from param_grid by the search
base_model = LogisticRegression(max_iter=3000, random_state=42)
grid_search = GridSearchCV(
    base_model,
    param_grid,
    cv=5,                    # 5-fold cross-validation
    scoring='accuracy',      # candidates are ranked by accuracy
    n_jobs=-1,               # use all cores
    verbose=2
)

# The search only sees the training split; X_test / y_test stay untouched for evaluation
grid_search.fit(X_train, y_train)

# Get best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score

# Evaluate the grid-searched estimator on the held-out split
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score

# Evaluate the manually configured logistic_model on the same held-out split.
# Note: this overwrites the y_pred produced for best_model in the previous cell.
y_pred = logistic_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
def predict_recommendation(comment, classifier=None):
    """
    Predict the recommendation class of a single Persian comment.

    The comment goes through the same steps used to build the training features:
    ``preprocess_text`` -> ``sentence_vector`` -> classifier.

    Args:
        comment: Raw comment text. ``preprocess_text`` raises TypeError for non-string input.
        classifier: Fitted estimator exposing ``predict``. When None (the default) the
            global ``logistic_model`` is used, so existing single-argument calls behave
            exactly as before; pass another fitted model (e.g. ``best_model``) to predict
            with it instead.

    Returns:
        "not_recommended", "recommended" or "no_idea"; "unknown" if the predicted
        code is none of 0, 1, 2.
    """
    if classifier is None:
        classifier = logistic_model

    # Step 1: Preprocess the input text
    processed_tokens = preprocess_text(comment)

    # Step 2: Convert to sentence vector
    vector = sentence_vector(processed_tokens)

    # Step 3: Reshape to a single-row 2-D array (1 sample, one column per embedding dimension)
    vector = np.array(vector).reshape(1, -1)

    # Step 4: Predict numeric label
    predicted_label = classifier.predict(vector)[0]

    # Step 5: Map numeric labels back to the original label strings
    label_map = {
        0: "not_recommended",
        1: "recommended",
        2: "no_idea"
    }

    return label_map.get(predicted_label, "unknown")


In [None]:
# Predict a label string for every test comment (one predict_recommendation call per row)
# and collect the predictions in a single-column frame named 'class'
pre = test_data['body'].apply(predict_recommendation)
submission = pd.DataFrame({'class':pre})
submission

In [None]:
import zipfile
import joblib
import os
 
if not os.path.exists(os.path.join(os.getcwd(), 'persian_comments_preprocessing.ipynb')):
    %notebook -e initial.ipynb


def compress(file_names):
    """
    Bundle the given files into ``result.zip`` in the current working directory.

    Each file is read from ``./<name>`` and stored in the archive under ``<name>``,
    compressed with DEFLATE. An existing ``result.zip`` is overwritten.

    Args:
        file_names: Iterable of file names (strings) relative to the working directory.
    """
    print("File Paths:")
    print(file_names)
    # Setting the compression on the archive applies DEFLATE to every member written below
    with zipfile.ZipFile("result.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for member in file_names:
            archive.write('./' + member, member)

# Write the predictions to submission.csv without the index column
submission.to_csv('submission.csv', index=False)

# Zip the notebook file together with the submission file into result.zip
file_names = ['persian_comments_preprocessing.ipynb', 'submission.csv']
compress(file_names)