# Libraries

In [39]:
# dealing with data
import pandas as pd

# traing algoritms
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
import string
# preproccessing
import nltk 
from nltk.corpus import stopwords

# saving the model
from joblib import dump,load



# Reading data and details

In [112]:
df = pd.read_csv("C:\\Users\\User\\Desktop\\output.csv")

In [113]:
type_counts = df['type'].value_counts()
print("sale:", type_counts.get('sale',0))
print("not sale:", type_counts.get('not sale',0))
print("total",type_counts.get('sale')+type_counts.get('not sale',0))

sale: 1396
not sale: 1351
total 2747


In [114]:
len(df['sentence'].unique())

2747

In [53]:
# duplicate_sentences = df[df.duplicated(subset=['sentence'], keep=False)]
# print("Number of duplicated sentences:", len(duplicate_sentences))

# df_no_duplicates = df.drop_duplicates(subset=['sentence'])
# print("Length of DataFrame after removing duplicates:", len(df_no_duplicates))

# df_no_duplicates.to_csv('output.csv', index=False)


In [37]:
# df_no_duplicates.shape
# df_no_duplicates.describe


# PreProccessing

In [115]:

# Define functions for cleaning
def remove_punctuation(sentence):
  """Removes punctuation characters from sentence, except for #."""
  return "".join([char for char in sentence if char not in string.punctuation or char == '#'])

def to_lowercase(sentence):
  """Converts sentence to lowercase."""
  return sentence.lower()

def stemming(sentence):
  """Reduces words to their root form using Porter Stemmer."""
  porter = nltk.stem.PorterStemmer()
  return " ".join([porter.stem(word) for word in sentence.split()])

def lemmatization(sentence):
  """Reduces words to their dictionary form using WordNet Lemmatizer."""
  wnl = nltk.WordNetLemmatizer()
  return " ".join([wnl.lemmatize(word) for word in sentence.split()])

def remove_stopwords(sentence):
  """Removes stop words """
  stop_words = stopwords.words('english')  # Download stopwords list (one-time)
  return " ".join([word for word in sentence.split() if word not in stop_words])

Apply PreProcessing

In [116]:

df["sentence"] = df["sentence"].apply(remove_punctuation)
df["sentence"] = df["sentence"].apply(to_lowercase)
df["sentence"] = df["sentence"].apply(remove_stopwords)  
df["sentence"] = df["sentence"].apply(stemming)  
df["sentence"] = df["sentence"].apply(lemmatization)

print("Data cleaning complete!")

Data cleaning complete!


In [117]:
duplicate_sentences = df[df.duplicated(subset=['sentence'])]
print("Number of duplicated sentences:", len(duplicate_sentences))
print("length of df before reomving dublicated sentences",len(df['sentence']))
df.drop_duplicates(subset=['sentence'], keep='first', inplace=True)
print("length of df after reomving dublicated sentences",len(df['sentence']))


Number of duplicated sentences: 21
length of df before reomving dublicated sentences 2747
length of df after reomving dublicated sentences 2726


 converting the sentences into a numerical feature matrix and make a weights for important words

In [118]:

X = df["sentence"]
y = df["type"]

vectorizer = TfidfVectorizer(max_features=5000)
X_features = vectorizer.fit_transform(X) 


Train test split


In [119]:

X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)


# Nave base algorithm for training the model

based on Bayes' theorem

--Fast and efficient, particularly for text classification tasks.


In [120]:
model = MultinomialNB()
model.fit(X_train, y_train)


MultinomialNB()

Prediction on Testing Set

In [121]:

y_pred = model.predict(X_test)


# Model Evaluation


First Evaluation with 1000 sales sentences and 300 not sale sentences

In [122]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='sale')
recall = recall_score(y_test, y_pred, pos_label='sale')

precisionNot = precision_score(y_test, y_pred, pos_label='not sale')
recallNot = recall_score(y_test, y_pred, pos_label='not sale')
print("accuracy:",accuracy)
print("precision:",precision)
print("precisionNiot:",precisionNot)
print("recall:",recall)
print("recallnot:",recallNot)

accuracy: 0.9633699633699634
precision: 0.9464285714285714
precisionNiot: 0.981203007518797
recall: 0.9814814814814815
recallnot: 0.9456521739130435


# Real Testing

In [136]:


new_sentence = "good morning from Budapest #fashion"

new_sentence = remove_punctuation(new_sentence.lower())
new_sentence = to_lowercase(new_sentence) 
new_sentence = remove_stopwords(new_sentence) 
new_sentence = stemming(new_sentence)
new_sentence = lemmatization(new_sentence) 

print(new_sentence)

new_features = vectorizer.transform([new_sentence])
prediction = model.predict(new_features)
print(prediction[0])

# for i in df['sentence']:
#     print(i)



good morn budapest #fashion
not sale


# Saving the model

In [124]:
dump(model, 'my_modelV3.joblib') 

['my_modelV3.joblib']

# Loading model

In [135]:
loaded_model = load('../trained_models/my_modelV3.joblib')  

sentence = "good morning from Budapest #fashion"
if len(sentence):
    sentence = remove_punctuation(sentence.lower())
    sentence = to_lowercase(sentence) 
    sentence = remove_stopwords(sentence) 
    sentence = stemming(sentence)
    sentence = lemmatization(sentence) 
    print(sentence)

    new_features = vectorizer.transform([sentence])
    prediction = model.predict(new_features)
    print(prediction[0])

good morn budapest #fashion
not sale


# side test

In [141]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold
import numpy as np

selector = VarianceThreshold(threshold=0.0)  # Remove features with zero variance
selector.fit(X_train)
X_filtered = selector.transform(X_train)  # Apply filtering on the training data
selector = SelectKBest(k=1000)
selector.fit(X_filtered, y_train)
selected_features = selector.get_support()
X_selected = X_filtered[:, selected_features]


In [147]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_selected, y_train)

from sklearn.metrics import accuracy_score

y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation accuracy: {accuracy:.4f}")




ValueError: Found input variables with inconsistent numbers of samples: [1130, 462]