In [1]:
pip install scikit-learn




In [2]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources this notebook relies on:
#   - 'stopwords' backs the stop-word set built just below;
#   - 'punkt' / 'punkt_tab' hold the tokenizer data loaded by
#     nltk.word_tokenize (which one is required depends on the installed
#     NLTK release, so both are requested).
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))

# Load the dataset (the rest of the notebook reads its 'Text' and 'Niche' columns)
data = pd.read_excel('data.xls')

# Preprocess the text data
def preprocess_text(text):
    """Normalise one sentence for vectorisation.

    The text is lower-cased, tokenised with ``nltk.word_tokenize``, and
    stripped of English stop words and punctuation-only tokens; the surviving
    tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str or scalar
        Raw sentence. Missing values (``None``/NaN) are treated as empty
        text; any other non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The cleaned, space-separated tokens (possibly an empty string).
    """
    # Cells read from a spreadsheet may be NaN or numeric instead of str;
    # calling .lower() on those would raise AttributeError.
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    punctuation_chars = set(string.punctuation)
    kept_tokens = []
    for token in nltk.word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # Drop tokens made up solely of punctuation characters. A plain
        # `token in string.punctuation` is a substring test and lets
        # multi-character tokens such as "..." or "--" through.
        if all(char in punctuation_chars for char in token):
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Clean every raw sentence and store the result in its own column.
cleaned_sentences = data['Text'].apply(preprocess_text)
data['preprocessed_sentence'] = cleaned_sentences



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
features = data['preprocessed_sentence']
labels = data['Niche']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Learn the TF-IDF vocabulary (capped at 4000 terms) from the training split
# only, then project both splits into that same feature space.
vectorizer = TfidfVectorizer(max_features=4000).fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [5]:
# Linear-kernel support vector classifier, trained on the TF-IDF features.
svm_model = SVC(C=1, kernel='linear')
svm_model.fit(X=X_train_tfidf, y=y_train)

SVC(C=1, kernel='linear')

In [7]:
# Score the held-out split: predict its labels, then compare with the true ones.
y_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.25


In [8]:
def predict_niche(sentence):
    """Predict the niche label for a single raw sentence.

    The sentence is cleaned with ``preprocess_text``, turned into a TF-IDF
    row by the already-fitted ``vectorizer``, and classified by ``svm_model``.

    Parameters
    ----------
    sentence : str
        Raw, unprocessed input text.

    Returns
    -------
    The first (and only) element of the model's prediction for that sentence.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([preprocess_text(sentence)])
    return svm_model.predict(features)[0]

# Try the classifier on a hand-written example sentence.
new_sentence = "coding is life and passion."
predicted_niche = predict_niche(sentence=new_sentence)
print("Predicted Niche:", predicted_niche)

Predicted Niche: Marketing


In [30]:
import joblib

In [31]:
# Persist both fitted artefacts. The classifier alone cannot score new text
# after a reload: raw sentences must first go through the same fitted TF-IDF
# vectorizer, so it is saved next to the model.
vectorizer_file_path = './tfidf_vectorizer.joblib'
joblib.dump(vectorizer, vectorizer_file_path)

# The classifier keeps its original path and contents; this dump stays last so
# the cell still displays the model file path.
model_file_path = './trained_model.joblib'
joblib.dump(svm_model, model_file_path)

['./trained_model.joblib']