In [None]:
# importing modules
import re
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources the cleaning step relies on.
# word_tokenize needs the Punkt tokenizer models: NLTK >= 3.8.2 loads them from
# 'punkt_tab', while older releases load the original 'punkt' package. Both ids
# are requested so the notebook works on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stopword list read by stopwords.words('english').
nltk.download('stopwords')

In [None]:
# Load dataset
# The CSV is read straight from its raw GitHub URL into a DataFrame.
url = (
    'https://raw.githubusercontent.com/pieroit/corso_ml_python_youtube_pollo/master/movie_review.csv'
)
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Data cleaning
# Values of the 'text' column; each one is passed through clean_text further
# down in this cell.
X = df['text']

# Pronouns removed in addition to NLTK's English stopword list.
_PRONOUNS = frozenset({
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
})

# Holds the combined discard set once it has been built, so the stopword
# corpus is read a single time instead of once per document.
_DISCARD_WORDS_CACHE = {}


def _discard_words():
    """Return the English stopwords plus pronouns as one set, built on first use."""
    if 'english' not in _DISCARD_WORDS_CACHE:
        _DISCARD_WORDS_CACHE['english'] = (
            frozenset(stopwords.words('english')) | _PRONOUNS
        )
    return _DISCARD_WORDS_CACHE['english']


def clean_text(text):
    """Normalize one document into a space-separated string of content words.

    Steps, in order: lowercase; replace every non-word character with a space;
    drop stand-alone single ASCII letters; collapse whitespace; tokenize with
    ``word_tokenize``; remove English stopwords and pronouns; re-join with
    single spaces.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    str
        The cleaned document; an empty string when no token survives.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove all special characters (anything that is not a word character)
    processed_text = re.sub(r'\W', ' ', text)

    # Remove every stand-alone single letter. The lookarounds test the
    # neighbouring characters without consuming them, so letters at the start
    # or end of the text and runs such as "x y z" are all removed. (The
    # previous patterns consumed the surrounding whitespace, which skipped
    # every other letter in a run, and the start-of-text pattern escaped the
    # caret and therefore never matched.)
    processed_text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', processed_text)

    # Substituting multiple spaces with single space
    processed_text = re.sub(r'\s+', ' ', processed_text)

    # Tokenize the text
    words = word_tokenize(processed_text)

    # Remove stopwords and pronouns
    discard = _discard_words()
    words = [word for word in words if word not in discard]

    # Join the words back to a string
    return ' '.join(words)

# Run every document in X through clean_text, keeping the original order.
processed_texts = list(map(clean_text, X))

In [None]:
# Model inputs: the cleaned documents as features, the 'tag' column as target.
X, y = processed_texts, df['tag']

In [None]:
# Text vectorization
# Bag-of-words term counts: one row per document, one column per vocabulary
# term. The result is kept as the SciPy sparse matrix returned by
# fit_transform. Densifying it with .toarray() allocates
# n_documents * n_terms cells, almost all of them zero, and
# train_test_split / LogisticRegression accept sparse input directly.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

In [None]:
# Train-test split
# Hold out 40% of the samples for evaluation. random_state fixes the shuffle
# so a fresh Restart & Run All reproduces the same split, and therefore the
# same reported accuracies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

In [None]:
# Logistic Regression with scikit-learn

# Build the classifier with its default settings, then fit it on the
# training portion of the split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predictions for both partitions, training first.
p_train, p_test = (model.predict(features) for features in (X_train, X_test))

# Fraction of correctly predicted labels on each partition.
acc_train = accuracy_score(y_true=y_train, y_pred=p_train)
acc_test = accuracy_score(y_true=y_test, y_pred=p_test)

# Report both scores on one line.
print(f'Training accuracy: {acc_train}, Test accuracy: {acc_test}')