<a href="https://colab.research.google.com/github/camilla8989/pythonstudy/blob/TextMining/Text_mining_assignment_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

image.png

## RETRIEVE NEWS DATA

In [None]:
# !pip3 install yfinance
# !pip3 install sklearn
# !pip3 install pandas

import urllib.request
import bs4 as bs
import yfinance as yf
import time
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
def get_news(ticker, start_date, end_date, delay=15, timeout=30):
    """Collect Google News RSS headlines for a search term, one request per day.

    Parameters
    ----------
    ticker : str
        Search term placed in the ``q=`` parameter of the RSS search URL.
    start_date, end_date : str
        First and last day to query (both inclusive), formatted ``'%Y-%m-%d'``.
    delay : float, optional
        Seconds to sleep before every request (default 15, the previous
        hard-coded value).
    timeout : float, optional
        Socket timeout in seconds for each request.

    Returns
    -------
    list of dict
        One ``{'Headline': str, 'Date': 'YYYY-MM-DD'}`` entry per feed item.
        A day whose download or parsing fails is reported with ``print`` and
        skipped, so the result may have gaps.
    """
    from urllib.parse import quote_plus

    current_date = datetime.strptime(start_date, '%Y-%m-%d')
    last_date = datetime.strptime(end_date, '%Y-%m-%d')
    one_day = timedelta(days=1)
    # Encode the search term so spaces or '&' cannot corrupt the query string.
    query = quote_plus(ticker)
    headlines = []

    while current_date <= last_date:
        formatted_date = current_date.strftime('%Y-%m-%d')
        query_date = current_date.strftime('%Y%m%d')
        # NOTE(review): the date operators are sent as YYYYMMDD with the same
        # day on both sides - confirm Google News really restricts results to
        # that single day with this form.
        url = (
            'https://news.google.com/rss/search'
            f'?q={query}+after%3A{query_date}+before%3A{query_date}'
            '&hl=en-US&gl=US&ceid=US:en'
        )
        time.sleep(delay)

        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                doc = response.read()
            parsed_doc = bs.BeautifulSoup(doc, 'lxml')

            # Read titles from <item> elements only: the feed itself also has a
            # <title> (the search query followed by "Google News"), which is
            # not a headline and must not end up in the data.
            for item in parsed_doc.find_all('item'):
                title = item.find('title')
                if title is not None and title.text.strip():
                    headlines.append({'Headline': title.text, 'Date': formatted_date})
        except Exception as e:
            # Best effort: keep going so one bad day does not lose the whole run.
            print(f"Error on {formatted_date}: {e}")

        current_date += one_day

    return headlines

In [None]:
def get_stock_data(ticker, start_date, end_date):
    """Download daily price history and label each day by its close-to-close move.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.Ticker``.
    start_date, end_date : str
        Range forwarded unchanged to ``Ticker.history(start=..., end=...)``.

    Returns
    -------
    pandas.DataFrame
        The history frame with two extra columns:
        ``PriceChange`` (``Close`` minus the previous row's ``Close``) and
        ``Label`` (1.0 if ``PriceChange >= 0``, 0.0 if negative, NaN when
        there is no previous row). The index is converted to
        ``'YYYY-MM-DD'`` strings.
    """
    hist = yf.Ticker(ticker).history(start=start_date, end=end_date)
    hist['PriceChange'] = hist['Close'].diff()

    change = hist['PriceChange']
    # The first row has no previous close, so its change is NaN. Keep the label
    # NaN too instead of letting `NaN >= 0` silently turn into 0 ("down").
    hist['Label'] = np.where(change.isna(), np.nan, np.where(change >= 0, 1.0, 0.0))

    # Only a datetime index has strftime; an empty result may not carry one.
    if isinstance(hist.index, pd.DatetimeIndex):
        hist.index = hist.index.strftime('%Y-%m-%d')
    return hist

In [None]:
# Search term handed to get_news (used as the news query text).
ticker = 'AMAZON'
# Date window ('%Y-%m-%d') passed to both get_news and get_stock_data.
start_date = '2017-07-01'
end_date = '2017-12-31'

In [None]:
# One HTTP request per calendar day, each preceded by get_news's default
# 15 s pause, so a six-month window takes roughly three quarters of an hour.
headlines = get_news(ticker, start_date, end_date)
# Preview a few entries instead of dumping the whole list into the output.
headlines[:5]

In [None]:
# Symbol used for the price download (the news query uses `ticker` instead).
ticker_1 = 'AMZN'
# Price history with PriceChange/Label columns, indexed by 'YYYY-MM-DD' strings.
stock_data = get_stock_data(ticker_1, start_date, end_date)
stock_data


In [None]:
# Collapse the per-item records into one row per day: normalise the date
# string, then concatenate all of that day's headlines with single spaces.
headlines_df = (
    pd.DataFrame(headlines)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']).dt.strftime('%Y-%m-%d'))
    .groupby('Date')
    .agg({'Headline': ' '.join})
    .reset_index()
)
headlines_df

Unnamed: 0,Date,Headline
0,2017-07-01,"""AMAZON after:20170701 before:20170701"" - Goog..."
1,2017-07-02,"""AMAZON after:20170702 before:20170702"" - Goog..."
2,2017-07-03,"""AMAZON after:20170703 before:20170703"" - Goog..."
3,2017-07-04,"""AMAZON after:20170704 before:20170704"" - Goog..."
4,2017-07-05,"""AMAZON after:20170705 before:20170705"" - Goog..."
...,...,...
179,2017-12-27,"""AMAZON after:20171227 before:20171227"" - Goog..."
180,2017-12-28,"""AMAZON after:20171228 before:20171228"" - Goog..."
181,2017-12-29,"""AMAZON after:20171229 before:20171229"" - Goog..."
182,2017-12-30,"""AMAZON after:20171230 before:20171230"" - Goog..."


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:
# Outer join on the date string keeps days that have news but no prices
# (and the reverse); the missing side is filled with NaN.
data = pd.merge(headlines_df, stock_data, how='outer', on='Date')
# Fill missing labels with the most recent available label.
# Assign the result back: calling ffill(inplace=True) on `data['Label']` can
# operate on a temporary object and leave `data` itself unchanged.
data['Label'] = data['Label'].ffill()
# NOTE(review): row label 3 is hard-coded; confirm which date this is meant
# to remove - it will be a different row if the date range changes.
data = data.drop(index=[3])
data.head(100)

In [None]:
# Plain-text view of the merged news/price frame.
print(data)

In [None]:
#CLEANING THE DATA 
import string
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')

# Built once and shared by every call to clean_headline.
punctuation_table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def clean_headline(text):
    """Normalise one day's headline text for bag-of-words models.

    Steps, in order: delete punctuation, lowercase, drop English stop words,
    Porter-stem each remaining word, then replace every non-letter character
    with a space.

    Non-string values (e.g. NaN for a day without news after the outer merge)
    are returned unchanged so they can still be removed by a later dropna.
    """
    if not isinstance(text, str):
        return text
    lowered = text.translate(punctuation_table).lower()
    kept_words = [word for word in lowered.split() if word not in stop_words]
    stemmed = ' '.join(stemmer.stem(word) for word in kept_words)
    return re.sub(r'[^a-zA-Z]', ' ', stemmed)


data['Headline'] = data['Headline'].apply(clean_headline)

data.head(100)


## SENTIMENT SCORE

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk 
! pip install nltk
nltk.download('vader_lexicon')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
#calculation of sentiment score
def calc_sentiment_score(headline):
    """Return the VADER ``compound`` score for ``headline``.

    Parameters
    ----------
    headline : str
        Text to score. Any non-string value (None, NaN, ...) yields NaN
        instead of raising.

    Returns
    -------
    float
        The ``'compound'`` entry of ``polarity_scores(headline)``, or NaN for
        non-string input.
    """
    if not isinstance(headline, str):
        return float('nan')

    # Build the analyser on first use and keep it on the function object, so
    # applying this to a whole column does not re-create it for every row.
    analyser = getattr(calc_sentiment_score, '_analyser', None)
    if analyser is None:
        analyser = SentimentIntensityAnalyzer()
        calc_sentiment_score._analyser = analyser

    return analyser.polarity_scores(headline)['compound']

In [None]:
def calc_sentiment_score_normalized(headline):
    """Return the VADER ``compound`` score of ``headline`` rescaled to 0..1.

    Parameters
    ----------
    headline : str
        Text to score. Any non-string value (None, NaN, ...) yields NaN
        instead of raising.

    Returns
    -------
    float
        ``(compound - min_score) / (max_score - min_score)`` with
        ``min_score = -1`` and ``max_score = 1``, or NaN for non-string input.
    """
    if not isinstance(headline, str):
        return float('nan')

    # Build the analyser on first use and keep it on the function object, so
    # applying this to a whole column does not re-create it for every row.
    analyser = getattr(calc_sentiment_score_normalized, '_analyser', None)
    if analyser is None:
        analyser = SentimentIntensityAnalyzer()
        calc_sentiment_score_normalized._analyser = analyser

    compound = analyser.polarity_scores(headline)['compound']

    # Bounds used to map the compound score onto the 0..1 interval.
    min_score = -1
    max_score = 1

    return (compound - min_score) / (max_score - min_score)

In [None]:
# One compound score per row. The earlier loop over the headlines only
# computed scores and threw them away, so it has been removed.
data['sentiment_score'] = data['Headline'].apply(calc_sentiment_score)
print(data)

           Date                                           Headline       Open  \
4    2017-07-05  amazon googl news shark tank star like bang sh...  48.076500   
5    2017-07-06  amazon googl news giant undersea river know li...  48.233002   
6    2017-07-07  amazon googl news whole food store evergreen p...  48.477501   
9    2017-07-10  amazon googl news amazon expand chicago offic ...  49.250000   
10   2017-07-11  amazon googl news new airship unlock amazon bb...  49.650002   
..          ...                                                ...        ...   
174  2017-12-22  amazon googl news alteryx data breach expo mil...  58.604000   
178  2017-12-26  amazon googl news last philli blackown booksto...  58.417999   
179  2017-12-27  amazon googl news bezo amazon top bloomberg bi...  58.995499   
180  2017-12-28  amazon googl news epa probe pollut scrap yard ...  59.450001   
181  2017-12-29  amazon googl news presid trump call postag rat...  59.117500   

          High        Low  

In [None]:
#normalized sentiment score
# One rescaled score per row. The earlier loop over the headlines only
# computed scores and threw them away, so it has been removed.
data['sentiment_score_normalized'] = data['Headline'].apply(calc_sentiment_score_normalized)
print(data)

           Date                                           Headline       Open  \
4    2017-07-05  amazon googl news shark tank star like bang sh...  48.076500   
5    2017-07-06  amazon googl news giant undersea river know li...  48.233002   
6    2017-07-07  amazon googl news whole food store evergreen p...  48.477501   
9    2017-07-10  amazon googl news amazon expand chicago offic ...  49.250000   
10   2017-07-11  amazon googl news new airship unlock amazon bb...  49.650002   
..          ...                                                ...        ...   
174  2017-12-22  amazon googl news alteryx data breach expo mil...  58.604000   
178  2017-12-26  amazon googl news last philli blackown booksto...  58.417999   
179  2017-12-27  amazon googl news bezo amazon top bloomberg bi...  58.995499   
180  2017-12-28  amazon googl news epa probe pollut scrap yard ...  59.450001   
181  2017-12-29  amazon googl news presid trump call postag rat...  59.117500   

          High        Low  

# AGGREGATION OF ALL DATA AND SCORES

The compound score is our target sentiment score; it ranges from -1 to 1. We use the following thresholds: if the compound score is above 0.05, the sentiment is positive overall, and the higher the score, the more positive the sentiment. Conversely, if the compound score is below -0.05, the sentiment is negative. If the score is between -0.05 and 0.05, the sentiment is neutral (neither positive nor negative).

In [None]:
# Plain-text view of the frame, now including the sentiment columns.
print(data)

           Date                                           Headline       Open  \
4    2017-07-05  amazon googl news shark tank star like bang sh...  48.076500   
5    2017-07-06  amazon googl news giant undersea river know li...  48.233002   
6    2017-07-07  amazon googl news whole food store evergreen p...  48.477501   
9    2017-07-10  amazon googl news amazon expand chicago offic ...  49.250000   
10   2017-07-11  amazon googl news new airship unlock amazon bb...  49.650002   
..          ...                                                ...        ...   
174  2017-12-22  amazon googl news alteryx data breach expo mil...  58.604000   
178  2017-12-26  amazon googl news last philli blackown booksto...  58.417999   
179  2017-12-27  amazon googl news bezo amazon top bloomberg bi...  58.995499   
180  2017-12-28  amazon googl news epa probe pollut scrap yard ...  59.450001   
181  2017-12-29  amazon googl news presid trump call postag rat...  59.117500   

          High        Low  

In [None]:
#cleaning the nan values
# Rows with any missing value (for example days without a price) are removed.
data = data.dropna()
# The Logistic Regression section reads 'output.csv'; write it here so the
# notebook runs top to bottom on a fresh kernel.
data.to_csv('output.csv', index=False)
print(data)

           Date                                           Headline       Open  \
4    2017-07-05  amazon googl news shark tank star like bang sh...  48.076500   
5    2017-07-06  amazon googl news giant undersea river know li...  48.233002   
6    2017-07-07  amazon googl news whole food store evergreen p...  48.477501   
9    2017-07-10  amazon googl news amazon expand chicago offic ...  49.250000   
10   2017-07-11  amazon googl news new airship unlock amazon bb...  49.650002   
..          ...                                                ...        ...   
174  2017-12-22  amazon googl news alteryx data breach expo mil...  58.604000   
178  2017-12-26  amazon googl news last philli blackown booksto...  58.417999   
179  2017-12-27  amazon googl news bezo amazon top bloomberg bi...  58.995499   
180  2017-12-28  amazon googl news epa probe pollut scrap yard ...  59.450001   
181  2017-12-29  amazon googl news presid trump call postag rat...  59.117500   

          High        Low  

## **TRAINING THE DATA**

### **Logistic Regression**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Read in the data (expects output.csv in the working directory)
data = pd.read_csv('output.csv', encoding = "ISO-8859-1")
# Keep only the text and the target. .copy() makes this an independent frame,
# so the column assignment below does not write into a slice of the CSV frame.
data = data[['Headline','Label']].copy()
# Delete the "amazon googl news" remnant from the cleaned headline text
data['Headline'] = data['Headline'].str.replace("amazon googl news", "", regex=False)
# Last expression of the cell, so the preview is actually displayed.
data.head(1)

In [None]:
# Set train set and test set (70/30). The seed is fixed so the reported
# metrics can be reproduced when the notebook is re-run.
train, test = train_test_split(data, test_size=0.3, random_state=42)
headlines = train['Headline'].astype(str).tolist()

In [None]:
# Bag-of-words with single words (unigrams) only. The vocabulary is learned
# from the training headlines; the result is a sparse count matrix.
basicvectorizer = CountVectorizer(ngram_range=(1,1))
basictrain = basicvectorizer.fit_transform(headlines)
# (number of training rows, vocabulary size)
print(basictrain.shape)

(87, 4434)


In [None]:
# Fit a default logistic regression on the unigram counts
# (fit() hands back the estimator itself, so construction and fit chain).
basicmodel = LogisticRegression().fit(basictrain, train["Label"])

In [None]:
# Prediction
testheadlines = test['Headline'].astype(str).tolist()
# transform (not fit_transform): reuse the vocabulary learned on the training rows.
basictest = basicvectorizer.transform(testheadlines)
predictions = basicmodel.predict(basictest)
# Confusion table: actual label (rows) against predicted label (columns).
pd.crosstab(test["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

In [None]:
# Evaluation model
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix

# Per-class report first, then overall accuracy, both on the held-out rows.
for metric in (classification_report, accuracy_score):
    print(metric(test["Label"], predictions))

              precision    recall  f1-score   support

         0.0       0.47      0.37      0.41        19
         1.0       0.48      0.58      0.52        19

    accuracy                           0.47        38
   macro avg       0.47      0.47      0.47        38
weighted avg       0.47      0.47      0.47        38

0.47368421052631576


In [None]:
# Same pipeline as the unigram model, but with 2- and 3-word n-grams as features
basicvectorizer2 = CountVectorizer(ngram_range=(2,3))
basictrain2 = basicvectorizer2.fit_transform(headlines)
# (number of training rows, number of n-gram features)
print(basictrain2.shape)

basicmodel2 = LogisticRegression()
basicmodel2 = basicmodel2.fit(basictrain2, train["Label"])

basictest2 = basicvectorizer2.transform(testheadlines)
predictions2 = basicmodel2.predict(basictest2)

# Printed explicitly: an expression in the middle of a cell is not displayed,
# so the confusion table was previously computed and dropped.
print(pd.crosstab(test["Label"], predictions2, rownames=["Actual"], colnames=["Predicted"]))

print (classification_report(test["Label"], predictions2))
print (accuracy_score(test["Label"], predictions2))

(87, 23786)
              precision    recall  f1-score   support

         0.0       0.58      0.37      0.45        19
         1.0       0.54      0.74      0.62        19

    accuracy                           0.55        38
   macro avg       0.56      0.55      0.54        38
weighted avg       0.56      0.55      0.54        38

0.5526315789473685
              precision    recall  f1-score   support

         0.0       0.58      0.37      0.45        19
         1.0       0.54      0.74      0.62        19

    accuracy                           0.55        38
   macro avg       0.56      0.55      0.54        38
weighted avg       0.56      0.55      0.54        38

0.5526315789473685


### **BERT Model**

In [None]:
# Cast both model columns in one step: headline text to str, target to int.
data = data.astype({'Headline': str, 'Label': int})

In [None]:
# Split the data into training and testing sets with a 7:3 ratio;
# random_state pins the shuffle so the split is the same on every run.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

In [None]:
!pip install transformers
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report


class NewsDataset(Dataset):
    """Dataset that tokenizes one DataFrame row at a time.

    Each item is ``(input_ids, attention_mask, label)`` where the two tensors
    come from calling ``tokenizer`` on the row's ``'Headline'`` (truncated to
    ``max_len``) with the result squeezed, and ``label`` is the row's
    ``'Label'`` value.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data, self.tokenizer, self.max_len = dataframe, tokenizer, max_len

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.data)

    def __getitem__(self, index):
        # Positional lookup, so the frame's own index labels do not matter.
        record = self.data.iloc[index]
        encoded = self.tokenizer(
            record['Headline'],
            return_tensors="pt",
            max_length=self.max_len,
            padding=True,
            truncation=True,
        )
        return (
            encoded["input_ids"].squeeze(),
            encoded["attention_mask"].squeeze(),
            record['Label'],
        )

# Config
# Seed torch before the model is created: the classification head is newly
# initialised and the training DataLoader shuffles, so both depend on the RNG.
torch.manual_seed(42)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

max_len = 128      # maximum number of tokens kept per headline
batch_size = 16
epochs = 5

train_dataset = NewsDataset(train_data, tokenizer, max_len)
test_dataset = NewsDataset(test_data, tokenizer, max_len)
# The DataLoaders are created in the training cell, where pad_collate_fn is
# available: items have different lengths and cannot be batched without it.

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Train Model
from typing import List
import torch

def pad_collate_fn(batch: List[tuple]):
    """Collate variable-length samples into padded batch tensors.

    Parameters
    ----------
    batch : list of tuple
        ``(input_ids, attention_mask, label)`` triples as produced by the
        dataset, where the two tensors may differ in length between samples.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask, labels)``: the first two are padded to
        the longest sample in the batch (batch dimension first), ``labels`` is
        a ``torch.long`` tensor.
    """
    input_ids, attention_mask, labels = zip(*batch)
    # Pad token ids with the tokenizer's pad id, and the mask with 0 so the
    # padded positions are marked as not-to-attend.
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.tensor(labels, dtype=torch.long)

    return input_ids, attention_mask, labels

# Loaders that batch through pad_collate_fn; only the training one shuffles.
train_dataloader = DataLoader(train_dataset, collate_fn=pad_collate_fn, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, collate_fn=pad_collate_fn, batch_size=batch_size, shuffle=False)

for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        # Move the three batch tensors to the model's device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        # Forward pass -> logits -> cross-entropy against the true labels.
        loss = loss_fn(model(input_ids, attention_mask=attention_mask).logits, labels)
        loss.backward()
        optimizer.step()



In [None]:
# Evaluation model
model.eval()
predictions, ground_truth = [], []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        logits = model(input_ids, attention_mask=attention_mask).logits
        # Predicted class = position of the largest logit in each row.
        preds = torch.max(logits, 1).indices
        predictions += preds.cpu().numpy().tolist()
        ground_truth += labels.cpu().numpy().tolist()

report = classification_report(ground_truth, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.56      0.33      0.42        15
           1       0.66      0.83      0.73        23

    accuracy                           0.63        38
   macro avg       0.61      0.58      0.57        38
weighted avg       0.62      0.63      0.61        38



In [None]:
# Bag-of-words counts for the Naive Bayes model, with scikit-learn's built-in
# English stop-word list removed. Fitted on every row of `data`.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['Headline'])

In [None]:
# Split the raw text first and learn the vocabulary from the training rows
# only; fitting the vectorizer on all rows lets test-set words leak into the
# feature space.
headlines_train, headlines_test, y_train, y_test = train_test_split(
    data['Headline'], data['Label'], test_size=0.2, random_state=42
)
X_train = vectorizer.fit_transform(headlines_train)
X_test = vectorizer.transform(headlines_test)

clf = MultinomialNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

ValueError: ignored

In [None]:
# Share of held-out rows the Naive Bayes model labelled correctly.
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.0


In [None]:
# Per-class precision, recall and F1 for the Naive Bayes predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       1.0
         1.0       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
