# Text classification using a logistic regression model
The code loads three sentiment analysis datasets from Yelp, Amazon, and IMDb, preprocesses the data, converts the text into numerical features using CountVectorizer, and trains logistic regression models to classify sentiment. The accuracy of each model is evaluated on the respective test sets, and the results are displayed, showing the effectiveness of the models on different sentiment analysis datasets.

In [5]:
import pandas as pd

filepath_dict = {'yelp':   'yelp_lablled.txt',
                 'amazon': 'amazon_cells_labelled.txt',
                 'imdb':   'imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t', encoding='utf-8')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    So there is no way for me to plug it in here i...
label                                                       0
source                                                   yelp
Name: 0, dtype: object


In [6]:
df.shape

(2748, 3)

In [7]:
sentences = ['John likes ice cream', 'John hates chocolate.']

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [10]:
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [11]:
from sklearn.model_selection import train_test_split

df_yelp = df[df['source'] == 'yelp']

sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(
   sentences, y, test_size=0.25, random_state=1000)

In [12]:
list(sentences_train)[:10]

['This product is great... it makes working a lot easier I can go to the copier while waiting on hold for something.',
 'I was amazed at the quick arrival of the two original lg cell phone batteries and and at a fraction of the price.',
 'Was not happy.',
 'This is the phone to get for 2005.... I just bought my S710a and all I can say is WOW!',
 'Very unreliable service from T-mobile !',
 'The camera, although rated at an impressive 1.3 megapixels, renders images that fall well below expectations of such a relatively high resolution.',
 "Save your money.... I've had this item for 11 months now.",
 'Motorola finally got the voice quality of a bluetooth headset right.',
 'This device is great in several situations:1.)',
 'My father has the V265, and the battery is dying.']

In [13]:
list(y_train)[:10]

[1, 1, 0, 1, 0, 0, 0, 1, 1, 0]

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_train

<750x1546 sparse matrix of type '<class 'numpy.int64'>'
	with 6817 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.796


In [16]:
for source in df['source'].unique(): # yelp , imdb , amazon
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
