# **Import Libraries**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset.
# A relative path is used instead of the Colab-only absolute '/content/...' path
# so the notebook also runs outside Colab and matches the loader used in the
# rewrite section below. NOTE(review): on Colab the working directory is
# normally /content, so the same file should still be found — confirm.
wine_data = pd.read_csv('OSX_DS_assignment.csv')

In [3]:
# Features (X) are the free-text review descriptions; the target (y) is the variety column
X, y = wine_data['review_description'], wine_data['variety']

In [4]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Turn the review text into TF-IDF vectors.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vect, X_test_vect = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [6]:
# Map the variety labels to integer codes.
# The encoder is fitted on the training labels and then reused for the test labels.
label_encoder = LabelEncoder()
y_train_enc, y_test_enc = (
    label_encoder.fit_transform(y_train),
    label_encoder.transform(y_test),
)

In [7]:
# Fit a multinomial Naive Bayes model on the TF-IDF features and encoded labels
classifier = MultinomialNB()
classifier.fit(X=X_train_vect, y=y_train_enc)

In [8]:
# Predict encoded variety labels for the held-out test vectors
y_pred = classifier.predict(X=X_test_vect)

In [9]:
# Convert the integer predictions back to the original variety names
y_pred_decoded = label_encoder.inverse_transform(y=y_pred)

In [10]:
# Score the decoded predictions against the true test labels and report the result
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_decoded)
print("Accuracy:", accuracy)

Accuracy: 0.4465521413017179


# **Rewriting the Project Another Way**

In [1]:
# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
# Read the assignment CSV (path is relative to the working directory) into a DataFrame
wine_data = pd.read_csv(filepath_or_buffer='OSX_DS_assignment.csv')

In [5]:
# Columns that make up the feature set (X); the target (y) is the variety column
selected_columns = [
    'review_title',
    'review_description',
    'designation',
    'points',
    'price',
    'country',
    'province',
    'region_1',
    'region_2',
    'winery',
]
X = wine_data[selected_columns]
y = wine_data['variety']


In [6]:
# Data preprocessing

# Handling missing values.
# A blanket fillna('Unknown') would write a string into numeric columns and
# turn them into mixed-type object columns. Fill per column type instead:
# non-numeric columns get the 'Unknown' placeholder, numeric columns get
# their own median.
numeric_cols = X.select_dtypes(include='number').columns
fill_values = {col: 'Unknown' for col in X.columns.difference(numeric_cols)}
fill_values.update(X[numeric_cols].median().to_dict())
X = X.fillna(value=fill_values)

In [7]:
# Text preprocessing: lowercase the reviews, then strip every character that is
# neither a word character nor whitespace (i.e. punctuation).
# regex=True is passed explicitly: without it, newer pandas versions treat the
# pattern as a literal string and no punctuation is removed. The raw string
# keeps \w and \s from being parsed as (invalid) Python escape sequences.
X['review_description'] = (
    X['review_description']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)


  X['review_description'] = X['review_description'].str.replace('[^\w\s]', '')


In [8]:
# Feature encoding: replace each categorical column with integer codes.
# One encoder instance is re-fitted per column, in this order, so after the
# loop it holds the classes of the last column ('designation').
label_encoder = LabelEncoder()
for column in ['country', 'province', 'region_1', 'region_2', 'winery', 'designation']:
    X[column] = label_encoder.fit_transform(X[column])

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Turn the cleaned review descriptions into TF-IDF vectors.
# The vectorizer is fitted on the training split only and reused for the test split.
vectorizer = TfidfVectorizer()
X_train_text, X_test_text = (
    vectorizer.fit_transform(X_train['review_description']),
    vectorizer.transform(X_test['review_description']),
)


In [12]:
# Combine text features with other numerical features
# NOTE(review): this step is disabled, so this cell creates neither
# `X_train_encoded` nor `X_test_encoded`, and the non-text columns are not
# merged into the feature matrix here.
# Presumably it was commented out because `.toarray()` densifies the whole
# TF-IDF matrix — TODO confirm.
#X_train_encoded = pd.concat([pd.DataFrame(X_train_text.toarray()), X_train.drop('review_description', axis=1)], axis=1)
#X_test_encoded = pd.concat([pd.DataFrame(X_test_text.toarray()), X_test.drop('review_description', axis=1)], axis=1)


In [13]:
# Map the variety labels to integer codes with a dedicated target encoder.
# It is fitted on the training labels and then reused for the test labels.
y_encoder = LabelEncoder()
y_train_enc, y_test_enc = (
    y_encoder.fit_transform(y_train),
    y_encoder.transform(y_test),
)

In [14]:
# Randomly oversample the training set (seeded for repeatability).
# Note that `y_train_enc` is rebound to the resampled labels here.
oversampler = RandomOverSampler(random_state=42)
X_train_encoded, y_train_enc = oversampler.fit_resample(
    X_train_text,
    y_train_enc,
)


In [15]:
# Two-step pipeline: chi-squared selection of the top-k features, followed by
# a seeded gradient-boosting classifier
pipeline = Pipeline(
    steps=[
        ('feature_selection', SelectKBest(chi2, k=1000)),
        ('classification', GradientBoostingClassifier(random_state=42)),
    ]
)


In [16]:
# Hyperparameter grid; each key is '<pipeline step name>__<parameter name>'
param_grid = dict([
    ('feature_selection__k', [500, 1000, 2000]),
    ('classification__n_estimators', [100, 200, 300]),
    ('classification__max_depth', [3, 4, 5]),
])



In [None]:
# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy.
# NOTE(review): the search is fitted on the already-oversampled training data,
# so duplicated rows may land in both the train and validation folds — confirm
# whether the CV scores are meant to be read that way.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_encoded, y_train_enc)

# Report the winning hyperparameter combination
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Show the hyperparameter combination selected by the grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Make predictions on the test set.
# The model was fitted on TF-IDF text features, so the matching test matrix is
# `X_test_text`. The previously referenced `X_test_encoded` is never defined
# (its only assignment is commented out), which raised a NameError.
y_pred = grid_search.predict(X_test_text)

# Decode the predicted labels back to variety names
y_pred_decoded = y_encoder.inverse_transform(y_pred)

# Calculate the accuracy against the original (string) test labels
accuracy = accuracy_score(y_test, y_pred_decoded)
print("Accuracy:", accuracy)


In [None]:
# Convert the integer predictions back to the original variety names
y_pred_decoded = y_encoder.inverse_transform(y=y_pred)

In [None]:
# Score the decoded predictions against the true test labels and report the result
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_decoded)
print("Accuracy:", accuracy)


# **Saving My Best Model**

In [None]:
import joblib

# Persist the best pipeline found by the grid search to 'best_model.pkl'
best_model = grid_search.best_estimator_
joblib.dump(value=best_model, filename='best_model.pkl')
