What we have:
2. Reviews: a list of reviews of the movies
2. Labels: 0 (negative) or 4 (positive), sentiment score for the corresponding review
3. Review : Label pairing

What we want:
1. Review -> Classifier Model -> Predicted Sentiment Score

In [None]:
import pandas as pd

# splits your X and Y data into train and test sets
from sklearn.model_selection import train_test_split
# represents each text by the frequencies of its words
# can be replaced with TfidfVectorizer, for example
from sklearn.feature_extraction.text import CountVectorizer
# logistic regression classifier (more on it can be found in the scikit-learn documentation)
from sklearn.linear_model import LogisticRegression

# directory that holds the input CSV; it is joined to the file name by plain
# string concatenation when the data is read, so it must end with a path separator
data_basepath = './'


Exploring / reading data

In [None]:
# location of the input CSV inside the base directory
csv_path = data_basepath + 'data.csv'
# load the file, using its first column as the row index
tweet_data = pd.read_csv(csv_path, index_col=0)

In [None]:
# distinct label values present in the 'target' column
tweet_data['target'].unique()

In [None]:
# preview the first rows of the loaded frame
tweet_data.head()

Shuffle the data

In [None]:
# shuffle all rows (frac=1 draws the whole frame, without replacement).
# The fixed seed makes the shuffle repeatable between runs; without it every
# run would pick a different subset of rows further down, so the seeded
# train/test split could not reproduce the same sets either.
tweet_data = tweet_data.sample(frac=1, random_state=1875754)

In [None]:
# keep the first 100,000 rows of the shuffled frame
# (slicing simply returns fewer rows if the frame is shorter)
data = tweet_data[:100000]

In [None]:
# class balance check: print the shape of the rows carrying each label
# (0 first, then 4); the two row counts should be roughly equal
for label in (0, 4):
    print(data[data['target'] == label].shape)

In [None]:
# turn the frame into a list of per-row dicts, then pull the processed tweet
# text and its label out of every row into two parallel lists
data_records = data.to_dict(orient='records')

input_text = []
input_labels = []
for record in data_records:
    input_text.append(record['tweet_proc'])
    input_labels.append(record['target'])

In [None]:
# tweet preprocessing caused some tweets to be empty (e.g., those with punctuation only);
# such entries are not strings and have to be removed together with their labels

# positions of every entry that is not a string
nan_indices = [
    position
    for position, text in enumerate(input_text)
    if not isinstance(text, str)
]

# set membership is O(1); testing against the list made the two filters
# below quadratic in the number of dropped entries
dropped_positions = set(nan_indices)

new_input_text = [
    text
    for position, text in enumerate(input_text)
    if position not in dropped_positions
]

new_input_labels = [
    label
    for position, label in enumerate(input_labels)
    if position not in dropped_positions
]

In [None]:
# number of labels left after dropping the non-text entries
len(new_input_labels)

In [None]:
# number of tweets left after dropping the non-text entries (should equal the label count)
len(new_input_text)

In [None]:
# split data into train and test
# specify the train size: 90% of the samples here (an 80/20 split is another common choice)
# fix the random state so that re-running this notebook assigns the tweets to the same sets
# (this only holds when the input lists themselves arrive in the same order)

X_train, X_test, y_train, y_test = train_test_split(
                new_input_text,
                new_input_labels,
                train_size=0.90,
                random_state=1875754
)

In [None]:
# number of training tweets
len(X_train)

In [None]:
# number of test tweets
len(X_test)

Represent input texts as features:

In [None]:
# bag-of-words representation: each tweet becomes a vector of word counts
count_vectorizer = CountVectorizer()

# train representation: learn the vocabulary from the training tweets and encode them
X_train_countvec = count_vectorizer.fit_transform(X_train)

# test representation: encode with the vocabulary learned from the training
# tweets (no re-fitting), so words seen only in test tweets are ignored
X_test_countvec = count_vectorizer.transform(X_test)

In [None]:
# inspect the training document-term matrix
X_train_countvec

In [None]:
# inspect the test document-term matrix
X_test_countvec

In [None]:
# features are represented by all unique words which are found in the
# training tweets; show a 1000-word slice of that vocabulary.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`, so prefer the new method and fall back
# to the old one only on installations that predate it.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()
feature_names[4000:5000]

In [None]:
# create the classifier with its default settings and train it on the
# training document-term matrix; `fit` hands back the fitted estimator
logistic_model = LogisticRegression().fit(X_train_countvec, y_train)

In [None]:
# predict sentiment based on document-term matrix of X_test
# (one predicted label per test tweet, in the same order as X_test)
y_pred = logistic_model.predict(X_test_countvec)

In [None]:
# mean accuracy of the fitted model on the held-out test set
logistic_model.score(X=X_test_countvec, y=y_test)

In [None]:
# inspect the predicted labels
y_pred

In [None]:
# scratch cell: print the integers 0 through 4, one per line
for i in (0, 1, 2, 3, 4):
    print(i)

In [None]:
# construct resulting dataframe: one row per test tweet, holding the tweet
# text, its true label and the label predicted by the model.
# The frame is built from whole columns in a single call; assigning rows one
# at a time with `.loc[i] = ...` enlarges the frame on every iteration, which
# gets very slow for thousands of rows.
output_df = pd.DataFrame(
    {
        'tweet': list(X_test),
        'actual': list(y_test),
        'predicted': list(y_pred),
    },
    columns=['tweet', 'actual', 'predicted'],
)

In [None]:
# print every test tweet followed by its predicted and its true sentiment,
# with a blank line between consecutive tweets
for record in output_df.itertuples(index=False):
    print('TWEET', record.tweet)
    print('SENTIMENT', record.predicted)
    print('REAL SENTIMENT', record.actual)
    print()