In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training and test CSV files from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlpgettingstarted/train.csv',
        '/kaggle/input/nlpgettingstarted/test.csv',
    )
)


In [None]:
def clean_text(text):
    """Normalize a raw string before tokenization and vectorization.

    Steps, applied in this order:
      1. lowercase the text;
      2. delete ``[...]`` bracketed spans;
      3. delete URLs (``http://``, ``https://`` or ``www.`` prefixes);
      4. delete ``<...>`` tags;
      5. delete every character in ``string.punctuation``;
      6. replace newlines with a single space;
      7. delete every word that contains at least one digit.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Removed spans are deleted rather than collapsed,
        so runs of consecutive spaces may remain in the result.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed
    # as invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space instead of deleting them, so the last word
    # of one line is not fused with the first word of the next.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# English stop-word list from NLTK, held as a set for fast membership tests.
stop_words = {*stopwords.words('english')}

def remove_stopwords(text):
    """Drop stop words from ``text``.

    The text is split with NLTK's ``word_tokenize``; every token found in the
    module-level ``stop_words`` set is discarded, and the remaining tokens are
    re-joined with single spaces.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
    return ' '.join(kept_tokens)

# Run both preprocessing passes (cleaning, then stop-word removal) over the
# 'text' column of each frame, overwriting that column with the result.
for frame in (train, test):
    frame['text'] = frame['text'].apply(clean_text).apply(remove_stopwords)


In [None]:
# TF-IDF over unigrams and bigrams; a term is kept only if it occurs in at
# least 2 documents and in at most half of all documents.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
)

# Learn the vocabulary and IDF weights from the training text, then encode the
# test text with that same fitted vectorizer.
# NOTE(review): the vectorizer is fit on the whole training frame before the
# train/validation split is made, so validation rows contribute to the
# vocabulary and IDF weights - validation scores may be slightly optimistic.
train_tfidf = tfidf.fit_transform(train['text'])
test_tfidf = tfidf.transform(test["text"])


In [None]:
# Hold out 20% of the vectorized training rows for validation; the fixed
# random_state makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    train_tfidf,
    train['target'],
    test_size=0.2,
    random_state=2020,
)

# Logistic regression with scikit-learn's default settings, fitted on the
# training portion of the split.
model = LogisticRegression()
model.fit(x_train, y_train)


In [None]:
# Predict on the held-out validation split and print the per-class metrics report.
y_pred = model.predict(x_val)
validation_report = classification_report(y_val, y_pred)
print(validation_report)


In [None]:
# Predict a label for every row of the vectorized test set with the fitted model.
test_pred = model.predict(test_tfidf)


In [None]:
# Pair each test-row id with its predicted label in a two-column frame.
submission_columns = {"id": test['id'], "target": test_pred}
submission = pd.DataFrame(submission_columns)

# index=False keeps the DataFrame index out of the written CSV.
submission.to_csv('submissionMINE.csv', index=False)