In [18]:
import pandas as pd

# Load data
train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')

# Inspect data
print("Training Data Sample:")
print(train_data.head())
print("\nTesting Data Sample:")
print(test_data.head())

# Check data structure
print("\nTraining Data Info:")
print(train_data.info())
print("\nTesting Data Info:")
print(test_data.info())

# Access reviews and sentiments
reviews = train_data['reviews']
sentiments = train_data['sentiments']
test_reviews = test_data['reviews']

Training Data Sample:
                                             reviews  sentiments
0  I bought this belt for my daughter in-law for ...           1
1  The size was perfect and so was the color.  It...           1
2  Fits and feels good, esp. for doing a swim rac...           1
3  These socks are absolutely the best. I take pi...           1
4  Thank you so much for the speedy delivery they...           1

Testing Data Sample:
                                             reviews
0  I bought 2 sleepers.  sleeper had holes in the...
1  I dare say these are just about the sexiest th...
2  everything about the transaction (price, deliv...
3  Not bad for just a shirt.  Very durable, and m...
4  These are truly wrinkle free and longer than t...

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7401 entries, 0 to 7400
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviews     7401 non-null   object
 

In [19]:
import re

# Load data
train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')

def clean_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'<[^>]+>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # remove non-alphabet characters
    return text

train_data['cleaned_reviews'] = train_data['reviews'].apply(clean_text)
test_data['cleaned_reviews'] = test_data['reviews'].apply(clean_text)
print(train_data[['reviews', 'cleaned_reviews']].head())
print(test_data[['reviews', 'cleaned_reviews']].head())

                                             reviews  \
0  I bought this belt for my daughter in-law for ...   
1  The size was perfect and so was the color.  It...   
2  Fits and feels good, esp. for doing a swim rac...   
3  These socks are absolutely the best. I take pi...   
4  Thank you so much for the speedy delivery they...   

                                     cleaned_reviews  
0  i bought this belt for my daughter inlaw for c...  
1  the size was perfect and so was the color  it ...  
2  fits and feels good esp for doing a swim race ...  
3  these socks are absolutely the best i take pil...  
4  thank you so much for the speedy delivery they...  
                                             reviews  \
0  I bought 2 sleepers.  sleeper had holes in the...   
1  I dare say these are just about the sexiest th...   
2  everything about the transaction (price, deliv...   
3  Not bad for just a shirt.  Very durable, and m...   
4  These are truly wrinkle free and longer than t... 

In [20]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')  # Download tokenizer

train_data['tokenized_reviews'] = train_data['cleaned_reviews'].apply(word_tokenize)
test_data['tokenized_reviews'] = test_data['cleaned_reviews'].apply(word_tokenize)
print(train_data[['tokenized_reviews', 'cleaned_reviews']].head())
print(test_data[['tokenized_reviews', 'cleaned_reviews']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maple\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                   tokenized_reviews  \
0  [i, bought, this, belt, for, my, daughter, inl...   
1  [the, size, was, perfect, and, so, was, the, c...   
2  [fits, and, feels, good, esp, for, doing, a, s...   
3  [these, socks, are, absolutely, the, best, i, ...   
4  [thank, you, so, much, for, the, speedy, deliv...   

                                     cleaned_reviews  
0  i bought this belt for my daughter inlaw for c...  
1  the size was perfect and so was the color  it ...  
2  fits and feels good esp for doing a swim race ...  
3  these socks are absolutely the best i take pil...  
4  thank you so much for the speedy delivery they...  
                                   tokenized_reviews  \
0  [i, bought, sleepers, sleeper, had, holes, in,...   
1  [i, dare, say, these, are, just, about, the, s...   
2  [everything, about, the, transaction, price, d...   
3  [not, bad, for, just, a, shirt, very, durable,...   
4  [these, are, truly, wrinkle, free, and, longer... 

In [21]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

train_data['tokenized_reviews'] = train_data['tokenized_reviews'].apply(lambda tokens: [t for t in tokens if t not in stop_words])
test_data['tokenized_reviews'] = test_data['tokenized_reviews'].apply(lambda tokens: [t for t in tokens if t not in stop_words])
print(train_data[['tokenized_reviews', 'cleaned_reviews']].head())
print(test_data[['tokenized_reviews', 'cleaned_reviews']].head())


                                   tokenized_reviews  \
0  [bought, belt, daughter, inlaw, christmas, loved]   
1    [size, perfect, color, looked, like, web, page]   
2  [fits, feels, good, esp, swim, race, highly, r...   
3  [socks, absolutely, best, take, pilates, class...   
4  [thank, much, speedy, delivery, came, time, re...   

                                     cleaned_reviews  
0  i bought this belt for my daughter inlaw for c...  
1  the size was perfect and so was the color  it ...  
2  fits and feels good esp for doing a swim race ...  
3  these socks are absolutely the best i take pil...  
4  thank you so much for the speedy delivery they...  
                                   tokenized_reviews  \
0  [bought, sleepers, sleeper, holes, arm, pit, a...   
1  [dare, say, sexiest, things, ive, ever, worn, ...   
2  [everything, transaction, price, delivery, tim...   
3  [bad, shirt, durable, matched, teams, colors, ...   
4  [truly, wrinkle, free, longer, average, womans... 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maple\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Define the lemmatization function
def lemmatize_text(text):
    if not isinstance(text, str):
        return ""  # Return empty string if text is not a valid string
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmatized_words)

# Apply the lemmatization function to your DataFrame, handling missing values
test_data['lemmatized_reviews'] = test_data['tokenized_reviews'].apply(lambda x: lemmatize_text(str(x)))
train_data['lemmatized_reviews'] = train_data['tokenized_reviews'].apply(lambda x: lemmatize_text(str(x)))

# Check the results
print(train_data[['tokenized_reviews', 'lemmatized_reviews']].head())
print(test_data[['tokenized_reviews', 'lemmatized_reviews']].head())

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maple\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                   tokenized_reviews  \
0  [bought, belt, daughter, inlaw, christmas, loved]   
1    [size, perfect, color, looked, like, web, page]   
2  [fits, feels, good, esp, swim, race, highly, r...   
3  [socks, absolutely, best, take, pilates, class...   
4  [thank, much, speedy, delivery, came, time, re...   

                                  lemmatized_reviews  
0  [ 'bought ' , 'belt ' , 'daughter ' , 'inlaw '...  
1  [ 'size ' , 'perfect ' , 'color ' , 'looked ' ...  
2  [ 'fits ' , 'feels ' , 'good ' , 'esp ' , 'swi...  
3  [ 'socks ' , 'absolutely ' , 'best ' , 'take '...  
4  [ 'thank ' , 'much ' , 'speedy ' , 'delivery '...  
                                   tokenized_reviews  \
0  [bought, sleepers, sleeper, holes, arm, pit, a...   
1  [dare, say, sexiest, things, ive, ever, worn, ...   
2  [everything, transaction, price, delivery, tim...   
3  [bad, shirt, durable, matched, teams, colors, ...   
4  [truly, wrinkle, free, longer, average, womans... 

In [39]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Updated configurations for testing
params_to_test = [
    {"C": 0.1, "max_features": 5000},
    {"C": 10, "max_features": 5000},
    {"C": 1.0, "max_features": 7000}  # keep the default C but increase features
]

# Loop through each configuration
for param in params_to_test:
    print(f"\nTesting with C={param['C']} and max_features={param['max_features']}")
    
    # Initialize TF-IDF Vectorizer with the specified max_features
    tfidf_vectorizer = TfidfVectorizer(max_features=param["max_features"], stop_words='english', ngram_range=(1,2))
    
    # Fit and transform the train data
    X_train_full = tfidf_vectorizer.fit_transform(train_data['lemmatized_reviews'])
    X_test = tfidf_vectorizer.transform(test_data['lemmatized_reviews'])
    
    # Split the training data into a training and validation set
    X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)
    
    # Initialize and train the SVM model with the specified C and class balancing
    svm_model = SVC(C=param['C'], class_weight='balanced')
    svm_model.fit(X_train, y_train)
    
    # Predict on the validation set
    y_val_pred = svm_model.predict(X_val)
    
    # Evaluate model performance on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    report = classification_report(y_val, y_val_pred)
    print("Validation Accuracy:", accuracy)
    print("Classification Report on Validation Set:\n", report)
    
    # Predict on the test set (for final submission if needed)
    y_test_pred = svm_model.predict(X_test)
    
    # Save the test set predictions to a CSV file (optional for each configuration)
    submission = pd.DataFrame({"review": test_data['reviews'], "predicted_sentiment": y_test_pred})
    submission_filename = f"submission_C{param['C']}_features{param['max_features']}.csv"
    submission.to_csv(submission_filename, index=False)
    print(f"Submission file '{submission_filename}' created with predictions.")


Testing with C=0.1 and max_features=5000
Validation Accuracy: 0.9054692775151925
Classification Report on Validation Set:
               precision    recall  f1-score   support

           0       0.67      0.64      0.66       209
           1       0.94      0.95      0.95      1272

    accuracy                           0.91      1481
   macro avg       0.81      0.80      0.80      1481
weighted avg       0.90      0.91      0.90      1481

Submission file 'submission_C0.1_features5000.csv' created with predictions.

Testing with C=10 and max_features=5000
Validation Accuracy: 0.9209993247805537
Classification Report on Validation Set:
               precision    recall  f1-score   support

           0       0.93      0.47      0.63       209
           1       0.92      0.99      0.96      1272

    accuracy                           0.92      1481
   macro avg       0.93      0.73      0.79      1481
weighted avg       0.92      0.92      0.91      1481

Submission file 'submi