In [100]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

from joblib import dump,load


# Assuming your model is trained and named 'model'



In [101]:
df = pd.read_csv("C:\\Users\\User\\Desktop\\output.csv")

In [102]:
df.head()

Unnamed: 0,sentence,type
0,I find comfort in wearing oversized sweaters o...,not sale
1,I appreciate the elegance of a well-tailored s...,not sale
2,"I love the feeling of slipping into a soft, wo...",not sale
3,I enjoy the versatility of a classic white but...,not sale
4,I'm always on the lookout for unique vintage f...,not sale


In [103]:
type_counts = df['type'].value_counts()
print("sale:", type_counts.get('sale',0))
print("not sale:", type_counts.get('not sale',0))

sale: 1001
not sale: 412


In [104]:

# Define functions for cleaning
def remove_punctuation(sentence):
  """Removes punctuation characters from sentence."""
  return "".join([char for char in sentence if char not in string.punctuation])

def to_lowercase(sentence):
  """Converts sentence to lowercase."""
  return sentence.lower()

def stemming(sentence):
  """Reduces words to their root form using Porter Stemmer."""
  porter = nltk.stemmer.PorterStemmer()
  return " ".join([porter.stemmer(word) for word in sentence.split()])

def lemmatization(sentence):
  """Reduces words to their dictionary form using WordNet Lemmatizer."""
  wnl = nltk.WordNetLemmatizer()
  return " ".join([wnl.lemmatize(word) for word in sentence.split()])

def remove_stopwords(sentence):
  """Removes stop words from sentence data."""
  stop_words = stopwords.words('english')  # Download stopwords list (one-time)
  return " ".join([word for word in sentence.split() if word not in stop_words])

In [106]:
# Clean data using defined functions
df["sentence"] = df["sentence"].apply(remove_punctuation)
df["sentence"] = df["sentence"].apply(to_lowercase)
df["sentence"] = df["sentence"].apply(remove_stopwords)  # Added stop word removal

# Choose either stemming or lemmatization (comment out the other)
# df["sentence"] = df["sentence"].apply(stemming)
df["sentence"] = df["sentence"].apply(lemmatization)

# Save cleaned df back to CSV (optional)
df.to_csv("cleaned_dataV2.csv", index=False)

print("Data cleaning complete!")

Data cleaning complete!


In [149]:

X = df["sentence"]  # Text data is our feature
y = df["type"]  # "sale" or "not sale" is the target variable

# Feature Engineering: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_features = vectorizer.fit_transform(X)  # Creates a TF-IDF matrix


In [150]:

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)


In [151]:
# Model Training: Multinomial Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)


MultinomialNB()

In [152]:
# Prediction on Testing Set
y_pred = model.predict(X_test)


In [153]:
# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='sale')
recall = recall_score(y_test, y_pred, pos_label='sale')

precisionNot = precision_score(y_test, y_pred, pos_label='not sale')
recallNot = recall_score(y_test, y_pred, pos_label='not sale')
print("accuracy:",accuracy)
print("precision:",precision)
print("precisionNiot:",precisionNot)
print("recall:",recall)
print("recallnot:",recallNot)

accuracy: 0.9328621908127208
precision: 0.9173913043478261
precisionNiot: 1.0
recall: 1.0
recallnot: 0.7361111111111112


In [157]:


new_sentence = "work with shop"

new_sentence = remove_punctuation(new_sentence.lower())
new_sentence = to_lowercase(new_sentence) 
new_sentence = remove_stopwords(new_sentence) 
new_sentence = lemmatization(new_sentence) 

print(new_sentence)

new_features = vectorizer.transform([new_sentence])
prediction = model.predict(new_features)


if prediction[0] == "sale":
  print("sale")
else:
  print("not sale")



work shop
sale


In [115]:
dump(model, 'my_modelV2.joblib') 

['my_modelV2.joblib']

In [89]:
loaded_model = load('my_model500.joblib')  

In [141]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold
import numpy as np

selector = VarianceThreshold(threshold=0.0)  # Remove features with zero variance
selector.fit(X_train)
X_filtered = selector.transform(X_train)  # Apply filtering on the training data
selector = SelectKBest(k=1000)
selector.fit(X_filtered, y_train)
selected_features = selector.get_support()
X_selected = X_filtered[:, selected_features]


In [147]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_selected, y_train)

from sklearn.metrics import accuracy_score

y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation accuracy: {accuracy:.4f}")




ValueError: Found input variables with inconsistent numbers of samples: [1130, 462]