<a href="https://colab.research.google.com/github/allisonlinn/CSUREMM/blob/main/Copy_of_Medium_Try1_(logistic%2C_climate).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#install needed packages
!pip install snorkel
!pip install textblob
#import libraries and modules
from google.colab import files
import io
import pandas as pd
#Snorkel
from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from textblob import TextBlob
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function
#NLP packages
import spacy
from nltk.corpus import stopwords
import string
import nltk
import nltk.tokenize
punc = string.punctuation
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#Supervised learning
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
##Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df = pd.read_csv('newcorrect.csv')

column_names = df.columns
print(column_names)

Index(['Date', 'Headline'], dtype='object')


In [6]:
df.head(21)


Unnamed: 0,Date,Headline
0,1/3/2014,Kerry Making Pact on Climate A Top Priority
1,1/7/2014,Big four EU economies seek tougher cuts in gas...
2,1/8/2014,EU makes carbon pollution more expensive
3,1/10/2014,Scientists back David Cameron on weather link ...
4,1/13/2014,EU considers scrapping 2030 binding renewables...
5,1/14/2014,"Compelling case for global deal on climate, sa..."
6,1/15/2014,Falling clean energy investment threatens UN c...
7,1/15/2014,"Under Investor Pressure, Utility to Study Emis..."
8,1/15/2014,Shale gas is no silver bullet for EU energy ma...
9,1/16/2014,EPA denies politics delayed pollution rules


In [7]:
#cleaning
df = df.rename(columns = {'Headline': 'text'})
df['text'] = df['text'].astype(str)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13210 entries, 0 to 13209
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    13210 non-null  object
 1   text    13210 non-null  object
dtypes: object(2)
memory usage: 206.5+ KB


In [8]:
df.head(21)

Unnamed: 0,Date,text
0,1/3/2014,Kerry Making Pact on Climate A Top Priority
1,1/7/2014,Big four EU economies seek tougher cuts in gas...
2,1/8/2014,EU makes carbon pollution more expensive
3,1/10/2014,Scientists back David Cameron on weather link ...
4,1/13/2014,EU considers scrapping 2030 binding renewables...
5,1/14/2014,"Compelling case for global deal on climate, sa..."
6,1/15/2014,Falling clean energy investment threatens UN c...
7,1/15/2014,"Under Investor Pressure, Utility to Study Emis..."
8,1/15/2014,Shale gas is no silver bullet for EU energy ma...
9,1/16/2014,EPA denies politics delayed pollution rules


In [9]:
# define constants

POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1

# define function which looks into input words to ascribe labels
def keyword_lookup(x, keywords, label):
  if any(word in x.text.lower() for word in keywords):
    return label
  return ABSTAIN

# define function wwhich assigns a correct label
def make_keyword_lf(keywords, label=POSITIVE):
  return LabelingFunction(
      name = f"keyword_{keywords[0]}",
      f=keyword_lookup,
      resources=dict(keywords=keywords,label=label)
  )

  #these two lists can be further extended
"""positive news might contain the following words' """
keyword_positive = make_keyword_lf(keywords=['curbs', 'solar', 'renewable', 'tax', 'regulation', 'electric', 'green', 'agreement', 'trees', 'decarbonize', 'pledge', 'clean', 'alliance', 'bipartisan', 'reduce', 'adopt', 'vow', 'announce', 'positive', 'happy', 'bright', 'confident', 'encouraged', 'negotiations', 'promote', 'talks' ])

"""negative news might contain the following words"""
keyword_negative = make_keyword_lf(keywords=['withdraw','abandon', 'coal', 'fossil fuel','rescind', 'oil', 'reject', 'emissions', 'methane', 'pollution', 'gas', 'dismisses', 'attacks', 'defeat', 'damage', 'natural gas', 'dead', 'fear', 'foul', 'fails', 'hostile', 'cuts', 'quash', 'victims',  'derail', 'deny', 'harm', 'against', 'disagreement', 'denier', 'weaken', 'bad', 'alarming', 'costs', 'worries', 'dismiss', 'pain', 'poison', 'unfair', 'unhealthy'
                                              ], label=NEGATIVE)

In [10]:
# set up a preprocessor for polarity and subjectivity using textlob
@preprocessor(memoize=True)
def textblob_sentiment(x):
  scores = TextBlob(x.text)
  x.polarity = scores.sentiment.polarity
  x.subjectivity = scores.sentiment.subjectivity
  return x

#find polarity
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
  return POSITIVE if x.polarity > 0.6 else ABSTAIN

#find subjectivity
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
  return POSITIVE if x.subjectivity >= 0.5 else ABSTAIN

In [11]:
#combine labeling functions
lfs = [keyword_positive, keyword_negative, textblob_polarity, textblob_subjectivity ]

#apply the lfs on the df
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df)

#apply the label model
label_model = LabelModel(cardinality=2, verbose = True)
#fit on the data
label_model.fit(L_snorkel)
#predict and create the labels
df["label"] = label_model.predict(L=L_snorkel)

df.head()

100%|██████████| 13210/13210 [00:23<00:00, 565.02it/s] 
100%|██████████| 100/100 [00:00<00:00, 923.11epoch/s]


Unnamed: 0,Date,text,label
0,1/3/2014,Kerry Making Pact on Climate A Top Priority,1
1,1/7/2014,Big four EU economies seek tougher cuts in gas...,0
2,1/8/2014,EU makes carbon pollution more expensive,1
3,1/10/2014,Scientists back David Cameron on weather link ...,-1
4,1/13/2014,EU considers scrapping 2030 binding renewables...,1


In [None]:
#Logistic Regression: binary classifier, estimates probability of an instance belonging to a class, makes according predictions

#text-preprocessing: tokenizing, lemmatizing, removal of stop words, removal of punctuation

In [12]:
#make a copy of the dataframe
data = df.copy()
#define a function which handles the text preprocessing
def preparation_text_data(data):
    """
    This pipeline prepares the text data, conducting the following steps:
    1) Tokenization
    2) Lemmatization
    4) Removal of stopwords
    5) Removal of punctuation
    """
    # initialize spacy object
    nlp = spacy.load('en_core_web_sm')
    # select raw text
    raw_text = data.text.values.tolist()
    # tokenize
    tokenized_text = [[nlp(i.lower().strip())] for i in tqdm(raw_text)]
    #define the punctuations and stop words
    punc = string.punctuation
    stop_words = set(stopwords.words('english'))
    #lemmatize, remove stopwords and punctuationd
    corpus = []
    for doc in tqdm(tokenized_text):
        corpus.append([word.lemma_ for word in doc[0] if (word.lemma_ not in stop_words and word.lemma_ not in punc)])
    # add prepared data to df
    data["text"] = corpus
    return data

#apply the data preprocessing function
data = preparation_text_data(data)

data.head(10)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tokenized_text = [[nlp(i.lower().strip())] for i in tqdm(raw_text)]


  0%|          | 0/13210 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for doc in tqdm(tokenized_text):


  0%|          | 0/13210 [00:00<?, ?it/s]

Unnamed: 0,Date,text,label
0,1/3/2014,"[kerry, make, pact, climate, top, priority]",1
1,1/7/2014,"[big, four, eu, economy, seek, tough, cut, gas...",0
2,1/8/2014,"[eu, make, carbon, pollution, expensive]",1
3,1/10/2014,"[scientist, back, david, cameron, weather, lin...",-1
4,1/13/2014,"[eu, consider, scrap, 2030, bind, renewable, t...",1
5,1/14/2014,"[compelling, case, global, deal, climate, say,...",-1
6,1/15/2014,"[fall, clean, energy, investment, threaten, un...",1
7,1/15/2014,"[investor, pressure, utility, study, emission]",0
8,1/15/2014,"[shale, gas, silver, bullet, eu, energy, market]",0
9,1/16/2014,"[epa, deny, politic, delay, pollution, rule]",0


In [13]:
data

Unnamed: 0,Date,text,label
0,1/3/2014,"[kerry, make, pact, climate, top, priority]",1
1,1/7/2014,"[big, four, eu, economy, seek, tough, cut, gas...",0
2,1/8/2014,"[eu, make, carbon, pollution, expensive]",1
3,1/10/2014,"[scientist, back, david, cameron, weather, lin...",-1
4,1/13/2014,"[eu, consider, scrap, 2030, bind, renewable, t...",1
...,...,...,...
13205,12/31/2022,"[japan, 's, neighbor, balk, plan, release, was...",-1
13206,12/31/2022,"[get, 1, bid, oil, gas, lease, alaska, 's, coo...",0
13207,1/1/2023,"[bank, need, financial, prod, tackle, climate,...",-1
13208,1/1/2023,"[uk, climate, group, call, temporary, halt, di...",-1


In [34]:
def text_representation(data):
  X_tfidf = None  # Initial assignment of X_tfidf

  try:
    tfidf_vect = TfidfVectorizer()
  except ValueError as e:
    pass

  data['text'] = data['text'].apply(lambda text: " ".join(set(text)))

  try:
    X_tfidf = tfidf_vect.fit_transform(data['text'])
  except ValueError as e:
    pass

  if X_tfidf is not None:
    print(X_tfidf.shape)
    print(tfidf_vect.get_feature_names())
    X_tfidf = pd.DataFrame(X_tfidf.toarray())

  return X_tfidf

# Apply the TF-IDF function
X_tfidf = text_representation(data)


In [30]:
X_tfidf = text_representation(data)

if X_tfidf is not None:
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.33, random_state=42)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    clf_score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    print("Accuracy:", clf_score)
    print(classification_report(y_test, y_pred))
else:
    pass



TypeError: ignored

In [24]:
X= X_tfidf
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
#fit Log Regression Model
clf= LogisticRegression()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

TypeError: ignored

In [27]:
new_data = ["new carbon tax expected to fail"]
tf = TfidfVectorizer()
tfdf = tf.fit_transform(data['text'])
vect = pd.DataFrame(tf.transform(new_data).toarray())
new_data = pd.DataFrame(vect)
logistic_prediction = clf.predict(new_data)
print(logistic_prediction)

ValueError: ignored