In [1]:
import pandas as pd

In [2]:
cnbc = pd.read_csv('cnbc_headlines.csv')

In [3]:
cnbc.head()

Unnamed: 0,Headlines,Time,Description
0,Jim Cramer: A better way to invest in the Covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying..."
1,Cramer's lightning round: I would own Teradyne,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin..."
2,,,
3,"Cramer's week ahead: Big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co..."
4,IQ Capital CEO Keith Bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be..."


In [4]:
cnbc.drop(columns = ['Time'] , inplace=True)

In [5]:
cnbc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3080 entries, 0 to 3079
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Headlines    2800 non-null   object
 1   Description  2800 non-null   object
dtypes: object(2)
memory usage: 48.3+ KB


In [6]:
cnbc.isnull().sum()

Headlines      280
Description    280
dtype: int64

In [7]:
cnbc.dropna(inplace=True)

In [8]:
cnbc.duplicated().sum()

10

In [9]:
cnbc.drop_duplicates(inplace = True)

In [10]:
cnbc.to_csv('main_cnbc.csv')

## Data Transformation

In [11]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self signed certificate in certificate chain
[nltk_data]     (_ssl.c:1002)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self signed certificate in certificate chain
[nltk_data]     (_ssl.c:1002)>


In [12]:
def text_transform(text):
    text = str(text)
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    
    lemmatizer = WordNetLemmatizer()
    
    for i in text:
        if i.isalnum() and (i not in stopwords.words('english') and i not in string.punctuation):
            i = lemmatizer.lemmatize(i)
            y.append(i)

    
    return " ".join(y) # return the list in the form of string    

In [13]:
for col in cnbc.columns:
    cnbc[col] = cnbc[col].apply(text_transform)
    print(col , "done")

Headlines done
Description done


In [14]:
cnbc.to_csv('transformed_cnbc.csv')

## Model Building

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
cnbc['combined_text'] = cnbc[cnbc.columns].apply(lambda row : ''.join(row.values.astype(str)) , axis = 1)

In [16]:
cnbc.head()

Unnamed: 0,Headlines,Description,combined_text
0,jim cramer better way invest vaccine gold rush,mad money host jim cramer recommended buying f...,jim cramer better way invest vaccine gold rush...
1,cramer lightning round would teradyne,mad money host jim cramer ring lightning round...,cramer lightning round would teradynemad money...
3,cramer week ahead big week earnings even bigge...,pay earnings company lancet publishes good new...,cramer week ahead big week earnings even bigge...
4,iq capital ceo keith bliss say tech healthcare...,keith bliss iq capital ceo join closing bell t...,iq capital ceo keith bliss say tech healthcare...
5,wall street delivered pullback waiting jim cra...,look stock company going lower even though des...,wall street delivered pullback waiting jim cra...


In [18]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cnbc['combined_text'])

In [19]:
vocab = vectorizer.get_feature_names_out() # saving vocabulary and tf-idf matrix

In [31]:
df = pd.read_csv('main_cnbc.csv')

In [32]:
def search_query(query , tfidf_matrix , vectorizer):
    preprocessed_query = text_transform(query)
    query_vector = vectorizer.transform([preprocessed_query])
    
    similarity_scores = cosine_similarity(query_vector , tfidf_matrix)
    top_indices = similarity_scores.argsort()[0][::-1]
    
    return df.iloc[top_indices[:5]]

In [33]:
query = "Financial Capital of America"
search_results = search_query(query, tfidf_matrix, vectorizer)
search_results[['Headlines' , 'Description']]

Unnamed: 0,Headlines,Description
2136,Bank of America CEO: Our mobile banking streng...,Jim Cramer sits down with Bank of America Chai...
803,Companies founded only by women receive 3% of ...,"""A founder who sees a huge market opportunity ..."
2133,GDP could rise to 4 percent in Q2 thanks to bu...,Jim Cramer sits down with Bank of America Chai...
1456,Bank of America Merrill Lynch: A strong China ...,"The ""real deal"" could spark a 10 percent rally..."
1076,Bank of America CEO on the importance of digit...,Bank of America CEO Brian Moynihan tells Jim C...


In [34]:
import pickle
pickle.dump( tfidf_matrix, open('tfidf-matrix.pkl' , 'wb'))
pickle.dump(vectorizer , open('vectorizer.pkl' , 'wb'))