In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [3]:
# Load the dataset from a CSV file; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('exampledataset2.csv')

In [4]:
# Preview the first rows (head() defaults to five) to check the available columns.
df.head()

Unnamed: 0,id,text,sentiment,age,name,profession,gender
0,1,I love this product!,Positive,25,Jack,Engineer,Male
1,2,This movie was amazing.,Positive,32,Emily,Teacher,Female
2,3,The service was terrible.,Negative,40,Michael,Salesperson,Male
3,4,The food tasted delicious.,Positive,27,Sophia,Chef,Female
4,5,I would not recommend this hotel.,Negative,35,Ethan,Hotel Manager,Male


In [5]:
# Pull out the two columns the model uses: the raw text and its sentiment label.
text_column, sentiment_column = df['text'], df['sentiment']

In [6]:
# Features are the raw texts; targets are the sentiment labels.
X, y = text_column, sentiment_column

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Create a bag-of-words vectorizer with scikit-learn's default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the training texts only and encode them as a
# document-term matrix; fitting on the training split alone keeps the test
# texts from influencing the vocabulary.
X_train_dtm = vectorizer.fit_transform(X_train)

# Encode the test texts with the vocabulary learned above (no re-fitting), so
# both matrices share the same columns.
X_test_dtm = vectorizer.transform(X_test)

In [8]:
# Show every token the fitted vectorizer learned from the training texts
print(
    "Vocabulary:",
    vectorizer.get_feature_names_out(),
)

Vocabulary: ['at' 'book' 'complete' 'concert' 'customer' 'deadline' 'delicious'
 'disappointing' 'exceeded' 'excellent' 'expectations' 'experience' 'food'
 'for' 'great' 'had' 'helpful' 'high' 'horrible' 'hotel' 'is' 'item'
 'morning' 'movie' 'must' 'my' 'not' 'of' 'outstanding' 'perfect'
 'performance' 'price' 'product' 'project' 'quality' 'read' 'recommend'
 'service' 'support' 'tasted' 'terrible' 'the' 'their' 'this' 'time'
 'today' 'too' 'traffic' 'unbearable' 'unrealistic' 'very' 'was' 'waste'
 'weather' 'with' 'would']


In [9]:
# Dense view of the training document-term matrix: one row per training text,
# one column per vocabulary token, each cell a token count.
print(
    "Training data (dense format):\n",
    X_train_dtm.toarray(),
)

Training data (dense format):
 [[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [10]:
# Dense view of the testing document-term matrix, encoded with the same
# vocabulary columns as the training matrix.
print(
    "Testing data (dense format):\n",
    X_test_dtm.toarray(),
)

Testing data (dense format):
 [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0]]


In [11]:
# Initialize the SVM classifier with scikit-learn's default hyperparameters
# (no custom parameters are passed to SVC here).
svm_classifier = SVC()

In [14]:
# Train the classifier on the vectorized training texts and their labels
svm_classifier.fit(X=X_train_dtm, y=y_train)

In [16]:
# Predict a sentiment label for each vectorized text in the held-out test set
y_predict = svm_classifier.predict(X=X_test_dtm)

In [17]:
# Show the predicted sentiment label for each held-out text, in test-set order
print("Predicted labels:", y_predict)

Predicted labels: ['Positive' 'Negative' 'Negative' 'Negative']


In [18]:
# Accuracy = fraction of held-out texts whose predicted label equals the true
# label; keyword arguments make the true/predicted roles explicit.
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)

In [19]:
# Report the accuracy rounded to two decimals.
# NOTE(review): with a test set as small as the 4 rows in the recorded run,
# accuracy can only move in steps of 0.25, so treat this figure as a rough signal.
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.50
