## Importings

In [1]:
# Importing basic stuff
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Train test split
from sklearn.model_selection import train_test_split

# Vectorizers
from sklearn.feature_extraction.text import CountVectorizer

# Models
from sklearn.linear_model import LogisticRegression

# evaluation
from sklearn.metrics import classification_report, confusion_matrix

## Reading data

In [2]:
# Naming the columns of the dataset. https://stackoverflow.com/questions/31645466/give-column-name-when-read-csv-file-pandas
colNames = ['polarity', 'title', 'text']

# Loading Dataset and assigning column names
df = pd.read_csv("./data/test.csv", names=colNames, header=None)

# Resizing the dataset, for faster computing time. A random sample from the dataset https://stackoverflow.com/questions/40986230/reduce-dataframe-size-in-pandas
df = df.sample(frac=0.1) # Get 10% of the data

In [3]:
# Looking on the data
(
    df
    #.sample(5)
    #.dtypes
    .info()
    #.loc[:,["polarity"]]
    #.loc[:,["title"]]
    #.loc[:,["text"]]
    #.value_counts()
    #.value_counts(normalize=True) # For seeing the count in %
    #.describe()
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 340754 to 208042
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   polarity  40000 non-null  int64 
 1   title     39999 non-null  object
 2   text      40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [4]:
# Cleaning the data. Removing null values and only take the values that contain info.
# https://stackoverflow.com/questions/13413590/how-to-drop-rows-of-pandas-dataframe-whose-value-in-a-certain-column-is-nan
df.dropna(how='any')
df = df[df['title'].notna()]
df = df[df['text'].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39999 entries, 340754 to 208042
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   polarity  39999 non-null  int64 
 1   title     39999 non-null  object
 2   text      39999 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [5]:
# Assigning values to X and Y
xText = df.text
xTitle = df.title
y = df.polarity

xText.head(1)

340754    This is a great border - we actually used it f...
Name: text, dtype: object

In [6]:
# Train test split of data
X_train, X_test, y_train, y_test = train_test_split(xText, y, test_size=0.33, random_state=42)

In [7]:
# Looking on one example
print("type of X_train: {}".format(type(X_train)))
print("length of X_train: {}".format(len(X_train)))
print("X_train[1]:\n{}".format(X_train.iloc[1500]))

type of X_train: <class 'pandas.core.series.Series'>
length of X_train: 26799
X_train[1]:
clockwork singles is a great album. classic LCB style. i must say that the original version of "Glam Bastard" is awesome. if u are already a fan of lower class brats you'll enjoy hearing your favorites and if you're new to the LCB army this a super album.


## Basics - Countvectorizer plus logistic regression

In [8]:
# Applying Count vectorizer
vect = CountVectorizer()
vect.fit(xText)
X_train = vect.transform(xText)

In [9]:
# Looking on the vectorized features. See page 331 for code
feature_names = vect.get_feature_names_out() # under methods: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

Number of features: 69827
First 20 features:
['00' '000' '0000' '00000' '000city' '000sf' '001' '003' '005821a' '007'
 '00700' '00am' '00and' '00gagne' '00s' '01' '010' '011' '0111' '0113']
Features 20010 to 20030:
['dura' 'durabale' 'durabe' 'durability' 'durable' 'durablecons'
 'durablesights' 'durablity' 'duracel' 'duracell' 'duracells' 'duracion'
 'duradera' 'duran' 'durango' 'duranni' 'durant' 'durante' 'duras'
 'duration']
Every 2000th feature:
['00' '9oz' 'amharic' 'autum' 'biret' 'burroway' 'chemo' 'concerts'
 'curve' 'dident' 'duplicating' 'establecen' 'fightin' 'gamelan' 'gundam'
 'honcho' 'inmate' 'justifiably' 'leonhart' 'manipulation' 'miraculously'
 'nephew' 'ormandy' 'perspective' 'prequel' 'rarest' 'retitled' 'sb'
 'siberia' 'spefically' 'summon' 'thereare' 'ttc' 'vannnt' 'whitley']


In [10]:
# Making a logistic regression model
BaseLR = LogisticRegression()

# Fitting the model
BaseLR.fit(X_train, y_train)


ValueError: Found input variables with inconsistent numbers of samples: [39999, 26799]

In [None]:
# Evaluating results
y_pred = BaseLR.predict(X_test)

# Printing classification report
print(classification_report(y_test, y_pred))