#### Import Modules

In [122]:
import sys, os
import re
import pandas as pd
import cohere
from cohere.classify import Example
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from read_write_util import ReadWriteUtil
from prompt_pipeline import PromptPipeline

In [48]:
reader = ReadWriteUtil()
API_KEY = os.getenv('API_KEY')
# print(API_KEY)
#Initialize reader and API_KEY

In [49]:
df = pd.read_csv('../data/news.csv')
df.head()
#Read data

Unnamed: 0,Domain,Title,Description,Body,Link,timestamp,Analyst_Average_Score,Analyst_Rank,Reference_Final_Score
0,rassegnastampa.news,Boris Johnson using a taxpayer-funded jet for ...,…often trigger a protest vote that can upset…t...,Boris Johnson using a taxpayer-funded jet for ...,https://rassegnastampa.news/boris-johnson-usin...,2021-09-09T18:17:46.258006,0.0,4,1.96
1,twitter.com,"Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...",http://twitter.com/CoruscaKhaya/status/1435585...,2021-09-08T13:02:45.802298,0.0,4,12.0
2,atpe-tchad.info,Marché Résines dans les peintures et revêtemen...,…COVID-19…COVID…COVID…COVID-19 et Post COVID…C...,Le rapport d’étude de marché Résines dans les ...,http://atpe-tchad.info/2021/09/13/marche-resin...,2021-09-13T07:32:46.244403,0.0,4,0.05
3,badbluetech.bitnamiapp.com,"AI drives data analytics surge, study finds",…hate raiders' linked to automated harassment ...,How to drive the funnel through content market...,http://badbluetech.bitnamiapp.com/p.php?sid=21...,2021-09-11T00:17:45.962605,0.0,4,6.1
4,kryptogazette.com,Triacetin Vertrieb Markt 2021: Globale Unterne...,…Abschnitten und Endanwendungen / Organisation...,Global Triacetin Vertrieb-Markt 2021 von Herst...,https://kryptogazette.com/2021/09/08/triacetin...,2021-09-08T12:47:46.078369,0.0,4,0.13


`Lets check for lengths of the body`

In [50]:
lengths = df['Body'].apply(lambda x: len(x))
lengths


0    13712
1      267
2     8273
3    22098
4     9554
5     1899
6     1729
7     3061
8     4428
9     5257
Name: Body, dtype: int64

`We can see that the lowest number of charachters we have is 267 most are above 2048 which is the highest number of tokens accepted by cohere, so we have to find a new approach to preprocess this data`

`We will try to understand the context of the body`

### Cleaning and Preprocessing

#### Convert to lower case

In [51]:
df['body_lower'] = [text.lower() for text in df['Body']]

In [52]:
df['body_lower']

0    boris johnson using a taxpayer-funded jet for ...
1    stumbled across an interesting case, a woman f...
2    le rapport d’étude de marché résines dans les ...
3    how to drive the funnel through content market...
4    global triacetin vertrieb-markt 2021 von herst...
5    south african police service office of the pro...
6    today is the 7th anniversary [tragic collapse ...
7    construction activity grew steadily by 4% in t...
8    - former eskom ceo matshela moses koko sought ...
9    global and regional beta-carotene market resea...
Name: body_lower, dtype: object

#### Tokenization

In [53]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/n/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [54]:
wtoken = [word_tokenize(text) for text in df['body_lower']]
# wtoken
stoken = [sent_tokenize(text) for text in df['body_lower']]

#### Remove Punctuation

In [55]:
reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')

no_punc = []

for filt in stoken:
    review = []
    for token in filt:
        new_token = reg.sub(u'', token)
        if not new_token == u'':
            review.append(new_token)
    no_punc.append(review)
    
# print(no_punc)

#### Remove Stop Words

In [56]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/n/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
no_stop = []

for text in no_punc:
    new_term_vector = []
    for word in text:
        if not word in stopwords.words('english'):
            new_term_vector.append(word)
            
    no_stop.append(new_term_vector)

#### Stemming and lemmatization

In [58]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/n/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [59]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/n/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [60]:
pstem = PorterStemmer()
wlem = WordNetLemmatizer()
preproc_text = []

for text in no_stop:
    final_text = []
    for word in text:
        pstem.stem(word)
        final_text.append(wlem.lemmatize(word))
    
    preproc_text.append(final_text)


#### Remove web words

In [61]:
reg = re.compile(r'(http)|(^www)|(html$)|(\w*\d{3,}\w*)')

tuned_text = []

for filt in preproc_text:
    review = []
    for token in filt:
        new_token = reg.sub(u'', token)
        if not new_token == u'':
            review.append(new_token)
    tuned_text.append(review)

In [62]:
df['context_txt'] = [" ".join(word) for word in tuned_text]
le = df['context_txt'].apply(lambda x: len(x))
le1 = df['Body'].apply(lambda x: len(x))
le,le1

(0    13189
 1      259
 2     5692
 3    19907
 4     8613
 5     1672
 6     1630
 7     2795
 8     4254
 9     4877
 Name: context_txt, dtype: int64,
 0    13712
 1      267
 2     8273
 3    22098
 4     9554
 5     1899
 6     1729
 7     3061
 8     4428
 9     5257
 Name: Body, dtype: int64)

`We can see that we still have more characters than the accepted one, so we have to discard from our prompt the one that have more characters`

In [63]:
title_length = df['Title'].apply(lambda x: len(x))
description_length = df['Description'].apply(lambda x: len(x))
title_length

0    121
1    267
2    222
3     43
4    156
5    103
6     54
7     78
8     84
9    150
Name: Title, dtype: int64

In [64]:
description_length

0    251
1    267
2    245
3    248
4    251
5    251
6    253
7    253
8    252
9    249
Name: Description, dtype: int64

### Preprocess description 

#### Convert to lowercase

In [65]:
df['desc_lower'] = [text.lower() for text in df['Description']]
df['desc_lower']

0    …often trigger a protest vote that can upset…t...
1    stumbled across an interesting case, a woman f...
2    …covid-19…covid…covid…covid-19 et post covid…c...
3    …hate raiders' linked to automated harassment ...
4    …abschnitten und endanwendungen / organisation...
5    …crime stamp out…n1 and r101 roads appear in c...
6    …in lagos, nigeria, 84 south africans were kil...
7    …additional spending on buildings, repairs and...
8    …lawsuit against public participation) designe...
9    …key players! – dsm – basf – allied biotech – ...
Name: desc_lower, dtype: object

#### Tokenization

In [66]:
desc_sent_token = [sent_tokenize(text) for text in df['Description']]

#### Removing punctuation

In [67]:
reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')

no_punc = []

for filt in desc_sent_token:
    review = []
    for token in filt:
        new_token = reg.sub(u'', token)
        if not new_token == u'':
            review.append(new_token)
    no_punc.append(review)


#### Removing stop words

In [68]:
no_stop = []

for text in no_punc:
    new_term_vector = []
    for word in text:
        if not word in stopwords.words('english'):
            new_term_vector.append(word)
            
    no_stop.append(new_term_vector)

#### Stemming and lemmatization

In [69]:
preproc_text = []

for text in no_stop:
    final_text = []
    for word in text:
        pstem.stem(word)
        final_text.append(wlem.lemmatize(word))
    
    preproc_text.append(final_text)

In [70]:
reg = re.compile(r'(http)|(^www)|(html$)|(\w*\d{3,}\w*)')

tuned_text = []

for filt in preproc_text:
    review = []
    for token in filt:
        new_token = reg.sub(u'', token)
        if not new_token == u'':
            review.append(new_token)
    tuned_text.append(review)

In [71]:

df['desc_context_txt'] = [" ".join(word) for word in tuned_text]

In [72]:
df['desc_context_txt']

0    often trigger a protest vote that can upsettha...
1    tumbled across an interesting case a woman fac...
2     et ost ovid19  valuation des risques lis au 1...
3    hate raiders linked to automated harassment ca...
4    bschnitten und ndanwendungen  rganisationen ov...
5    rime tamp ut1 and  roads appear in court   rim...
6    in agos igeria 84 outh fricans were killed he ...
7    additional spending on buildings repairs and s...
8    awsuit gainst ublic articipation designed to i...
9    key players      llied iotech  hrhistorical ma...
Name: desc_context_txt, dtype: object

In [170]:
lengths = df['desc_context_txt'].apply(lambda x: len(x)) 
lengths

0    239
1    252
2    139
3    217
4    207
5    173
6    212
7    241
8    221
9    193
Name: desc_context_txt, dtype: int64

### Classify to Groups/Classes Based on Analyst_Average_Score

In [73]:
df

Unnamed: 0,Domain,Title,Description,Body,Link,timestamp,Analyst_Average_Score,Analyst_Rank,Reference_Final_Score,body_lower,context_txt,desc_lower,desc_context_txt
0,rassegnastampa.news,Boris Johnson using a taxpayer-funded jet for ...,…often trigger a protest vote that can upset…t...,Boris Johnson using a taxpayer-funded jet for ...,https://rassegnastampa.news/boris-johnson-usin...,2021-09-09T18:17:46.258006,0.0,4,1.96,boris johnson using a taxpayer-funded jet for ...,boris johnson using a taxpayerfunded jet for a...,…often trigger a protest vote that can upset…t...,often trigger a protest vote that can upsettha...
1,twitter.com,"Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...",http://twitter.com/CoruscaKhaya/status/1435585...,2021-09-08T13:02:45.802298,0.0,4,12.0,"stumbled across an interesting case, a woman f...",stumbled across an interesting case a woman fa...,"stumbled across an interesting case, a woman f...",tumbled across an interesting case a woman fac...
2,atpe-tchad.info,Marché Résines dans les peintures et revêtemen...,…COVID-19…COVID…COVID…COVID-19 et Post COVID…C...,Le rapport d’étude de marché Résines dans les ...,http://atpe-tchad.info/2021/09/13/marche-resin...,2021-09-13T07:32:46.244403,0.0,4,0.05,le rapport d’étude de marché résines dans les ...,le rapport dtude de march rsines dans les pein...,…covid-19…covid…covid…covid-19 et post covid…c...,et ost ovid19 valuation des risques lis au 1...
3,badbluetech.bitnamiapp.com,"AI drives data analytics surge, study finds",…hate raiders' linked to automated harassment ...,How to drive the funnel through content market...,http://badbluetech.bitnamiapp.com/p.php?sid=21...,2021-09-11T00:17:45.962605,0.0,4,6.1,how to drive the funnel through content market...,how to drive the funnel through content market...,…hate raiders' linked to automated harassment ...,hate raiders linked to automated harassment ca...
4,kryptogazette.com,Triacetin Vertrieb Markt 2021: Globale Unterne...,…Abschnitten und Endanwendungen / Organisation...,Global Triacetin Vertrieb-Markt 2021 von Herst...,https://kryptogazette.com/2021/09/08/triacetin...,2021-09-08T12:47:46.078369,0.0,4,0.13,global triacetin vertrieb-markt 2021 von herst...,global triacetin vertriebmarkt von hersteller...,…abschnitten und endanwendungen / organisation...,bschnitten und ndanwendungen rganisationen ov...
5,mype.co.za,Male arrested for the murder of an elderly fem...,…Crime Stamp Out…N1 and R101 roads appear in c...,South African Police Service Office of the Pro...,https://mype.co.za/new/male-arrested-for-the-m...,2021-09-10T00:17:46.055622,1.33,2,11.0,south african police service office of the pro...,south african police service office of the pro...,…crime stamp out…n1 and r101 roads appear in c...,rime tamp ut1 and roads appear in court rim...
6,eminetra.co.za,7th Anniversary of SCOAN Collapse in Nigeria-S...,"…in Lagos, Nigeria, 84 South Africans were kil...",Today is the 7th anniversary [Tragic collapse ...,https://eminetra.co.za/7th-anniversary-of-scoa...,2021-09-12T05:17:50.279081,0.0,4,10.1,today is the 7th anniversary [tragic collapse ...,today is the 7th anniversary tragic collapse o...,"…in lagos, nigeria, 84 south africans were kil...",in agos igeria 84 outh fricans were killed he ...
7,eminetra.co.za,The construction sector is expected to be boos...,"…additional spending on buildings, repairs and...",Construction activity grew steadily by 4% in t...,https://eminetra.co.za/the-construction-sector...,2021-09-09T09:02:46.320793,1.66,1,1.36,construction activity grew steadily by 4% in t...,construction activity grew steadily by 4 in th...,"…additional spending on buildings, repairs and...",additional spending on buildings repairs and s...
8,news24.com,News24.com | Court dismisses attempt by former...,…Lawsuit Against Public Participation) designe...,- Former Eskom CEO Matshela Moses Koko sought ...,https://www.news24.com/news24/southafrica/news...,2021-09-09T19:32:46.239682,0.33,3,2.4,- former eskom ceo matshela moses koko sought ...,former eskom ceo matshela moses koko sought ...,…lawsuit against public participation) designe...,awsuit gainst ublic articipation designed to i...
9,manometcurrent.com,Global and Regional Beta-Carotene Market Resea...,…key players! – DSM – BASF – Allied Biotech – ...,Global and Regional Beta-Carotene Market Resea...,https://manometcurrent.com/global-and-regional...,2021-09-13T03:02:45.609228,0.0,4,0.22,global and regional beta-carotene market resea...,global and regional betacarotene market resear...,…key players! – dsm – basf – allied biotech – ...,key players llied iotech hrhistorical ma...


In [74]:
rank__to_10 = df['Analyst_Average_Score'].apply(lambda x: 'low' if x < 5 else 'high')
rank__to_10
#We will add new column with value 'high' for Analyst_Average_Score >= 5 and 'low' for Analyst_Average_Score < 5 and >=0

0    low
1    low
2    low
3    low
4    low
5    low
6    low
7    low
8    low
9    low
Name: Analyst_Average_Score, dtype: object

In [75]:
df['rank_1'] = rank__to_10
df['rank_1']

0    low
1    low
2    low
3    low
4    low
5    low
6    low
7    low
8    low
9    low
Name: rank_1, dtype: object

In [76]:
def handle_sub_class(value):
    if value >= 0 and value < 1:
        return "low_1"
    elif value >= 1 and value < 2:
        return "low_2"
    elif value >= 2 and value < 3:
        return "low_3"
    elif value >= 3 and value < 4:
        return "low_4"
    elif value >= 4 and value < 5:
        return "low_5"
    elif value >= 5 and value < 6:
        return "high_1"
    elif value >= 6 and value < 7:
        return "high_2"
    elif value >= 7 and value < 8:
        return "high_3"
    elif value >= 8 and value < 9:
        return "high_4"
    else:
        return "high_5"

#We will add new column with value 'high_1' for rows having Analyst_Average_Score value of 5-6, high_2 for rows having Analyst_Average_Score value of 6-7 til high_5 for rows having Analyst_Average_Score value of 9-10 
#We will add new column with value 'low_1' for rows having Analyst_Average_Score value of 0-1, low_2 for rows having Analyst_Average_Score value of 1-2 til low_5 for rows having Analyst_Average_Score value of 4-5 

In [77]:
df['rank_2'] = df['Analyst_Average_Score'].apply(lambda x: handle_sub_class(x))
df['rank_2']

0    low_1
1    low_1
2    low_1
3    low_1
4    low_1
5    low_2
6    low_1
7    low_2
8    low_1
9    low_1
Name: rank_2, dtype: object

In [92]:
df['rank_3'] = df['Analyst_Average_Score'].apply(lambda x: handle_sub_class(int("{:.2f}".format(x)[2])))
df['rank_3']

0     low_1
1     low_1
2     low_1
3     low_1
4     low_1
5     low_4
6     low_1
7    high_2
8     low_4
9     low_1
Name: rank_3, dtype: object

In [93]:
df['rank_4'] = df['Analyst_Average_Score'].apply(lambda x: handle_sub_class(int("{:.2f}".format(x)[3])))
df['rank_4']

0     low_1
1     low_1
2     low_1
3     low_1
4     low_1
5     low_4
6     low_1
7    high_2
8     low_4
9     low_1
Name: rank_4, dtype: object

In [94]:
df

Unnamed: 0,Domain,Title,Description,Body,Link,timestamp,Analyst_Average_Score,Analyst_Rank,Reference_Final_Score,body_lower,context_txt,desc_lower,desc_context_txt,rank_1,rank_2,rank_3,rank_4
0,rassegnastampa.news,Boris Johnson using a taxpayer-funded jet for ...,…often trigger a protest vote that can upset…t...,Boris Johnson using a taxpayer-funded jet for ...,https://rassegnastampa.news/boris-johnson-usin...,2021-09-09T18:17:46.258006,0.0,4,1.96,boris johnson using a taxpayer-funded jet for ...,boris johnson using a taxpayerfunded jet for a...,…often trigger a protest vote that can upset…t...,often trigger a protest vote that can upsettha...,low,low_1,low_1,low_1
1,twitter.com,"Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...",http://twitter.com/CoruscaKhaya/status/1435585...,2021-09-08T13:02:45.802298,0.0,4,12.0,"stumbled across an interesting case, a woman f...",stumbled across an interesting case a woman fa...,"stumbled across an interesting case, a woman f...",tumbled across an interesting case a woman fac...,low,low_1,low_1,low_1
2,atpe-tchad.info,Marché Résines dans les peintures et revêtemen...,…COVID-19…COVID…COVID…COVID-19 et Post COVID…C...,Le rapport d’étude de marché Résines dans les ...,http://atpe-tchad.info/2021/09/13/marche-resin...,2021-09-13T07:32:46.244403,0.0,4,0.05,le rapport d’étude de marché résines dans les ...,le rapport dtude de march rsines dans les pein...,…covid-19…covid…covid…covid-19 et post covid…c...,et ost ovid19 valuation des risques lis au 1...,low,low_1,low_1,low_1
3,badbluetech.bitnamiapp.com,"AI drives data analytics surge, study finds",…hate raiders' linked to automated harassment ...,How to drive the funnel through content market...,http://badbluetech.bitnamiapp.com/p.php?sid=21...,2021-09-11T00:17:45.962605,0.0,4,6.1,how to drive the funnel through content market...,how to drive the funnel through content market...,…hate raiders' linked to automated harassment ...,hate raiders linked to automated harassment ca...,low,low_1,low_1,low_1
4,kryptogazette.com,Triacetin Vertrieb Markt 2021: Globale Unterne...,…Abschnitten und Endanwendungen / Organisation...,Global Triacetin Vertrieb-Markt 2021 von Herst...,https://kryptogazette.com/2021/09/08/triacetin...,2021-09-08T12:47:46.078369,0.0,4,0.13,global triacetin vertrieb-markt 2021 von herst...,global triacetin vertriebmarkt von hersteller...,…abschnitten und endanwendungen / organisation...,bschnitten und ndanwendungen rganisationen ov...,low,low_1,low_1,low_1
5,mype.co.za,Male arrested for the murder of an elderly fem...,…Crime Stamp Out…N1 and R101 roads appear in c...,South African Police Service Office of the Pro...,https://mype.co.za/new/male-arrested-for-the-m...,2021-09-10T00:17:46.055622,1.33,2,11.0,south african police service office of the pro...,south african police service office of the pro...,…crime stamp out…n1 and r101 roads appear in c...,rime tamp ut1 and roads appear in court rim...,low,low_2,low_4,low_4
6,eminetra.co.za,7th Anniversary of SCOAN Collapse in Nigeria-S...,"…in Lagos, Nigeria, 84 South Africans were kil...",Today is the 7th anniversary [Tragic collapse ...,https://eminetra.co.za/7th-anniversary-of-scoa...,2021-09-12T05:17:50.279081,0.0,4,10.1,today is the 7th anniversary [tragic collapse ...,today is the 7th anniversary tragic collapse o...,"…in lagos, nigeria, 84 south africans were kil...",in agos igeria 84 outh fricans were killed he ...,low,low_1,low_1,low_1
7,eminetra.co.za,The construction sector is expected to be boos...,"…additional spending on buildings, repairs and...",Construction activity grew steadily by 4% in t...,https://eminetra.co.za/the-construction-sector...,2021-09-09T09:02:46.320793,1.66,1,1.36,construction activity grew steadily by 4% in t...,construction activity grew steadily by 4 in th...,"…additional spending on buildings, repairs and...",additional spending on buildings repairs and s...,low,low_2,high_2,high_2
8,news24.com,News24.com | Court dismisses attempt by former...,…Lawsuit Against Public Participation) designe...,- Former Eskom CEO Matshela Moses Koko sought ...,https://www.news24.com/news24/southafrica/news...,2021-09-09T19:32:46.239682,0.33,3,2.4,- former eskom ceo matshela moses koko sought ...,former eskom ceo matshela moses koko sought ...,…lawsuit against public participation) designe...,awsuit gainst ublic articipation designed to i...,low,low_1,low_4,low_4
9,manometcurrent.com,Global and Regional Beta-Carotene Market Resea...,…key players! – DSM – BASF – Allied Biotech – ...,Global and Regional Beta-Carotene Market Resea...,https://manometcurrent.com/global-and-regional...,2021-09-13T03:02:45.609228,0.0,4,0.22,global and regional beta-carotene market resea...,global and regional betacarotene market resear...,…key players! – dsm – basf – allied biotech – ...,key players llied iotech hrhistorical ma...,low,low_1,low_1,low_1


In [95]:
df['final_score'] = df['rank_1'].map(str)+'__'+df['rank_2'].map(str)+'__'+df['rank_3'].map(str)+'__'+df['rank_4'].map(str)
df['final_score']

0      low__low_1__low_1__low_1
1      low__low_1__low_1__low_1
2      low__low_1__low_1__low_1
3      low__low_1__low_1__low_1
4      low__low_1__low_1__low_1
5      low__low_2__low_4__low_4
6      low__low_1__low_1__low_1
7    low__low_2__high_2__high_2
8      low__low_1__low_4__low_4
9      low__low_1__low_1__low_1
Name: final_score, dtype: object

#### Save cleaned and classified data

In [96]:
df.to_csv('../data/news_processed.csv')

### Using few-shot classifications 

#### Using only the final score

In [187]:
X,y = df['Title'], df['final_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2, random_state=21)

In [188]:
intents = y_train.unique().tolist()
print(intents)
# View the list of all available final_score class

['low__low_2__high_2__high_2', 'low__low_1__low_1__low_1', 'low__low_2__low_4__low_4', 'low__low_1__low_4__low_4']


In [189]:
ex_texts, ex_labels = [], []
for intent in intents:
  ex_texts += X_train.tolist()
  ex_labels += y_train.tolist()

print(f'Number of classes: {len(intents)}')
print(f'Total number of examples: {len(ex_texts)}')

Number of classes: 4
Total number of examples: 32


In [190]:
examples = list()
for txt, lbl in zip(ex_texts,ex_labels):
  examples.append(Example(txt,lbl))

In [191]:
co = cohere.Client(API_KEY)

In [192]:
def classify_text(text,examples):
  classifications = co.classify(
    model='xlarge',
    inputs=[text],
    examples=examples
    )
  return classifications.classifications[0].prediction

In [214]:
y_pred = X_test.apply(classify_text, args=(examples,)).tolist()
y_pred

['low__low_1__low_1__low_1', 'low__low_1__low_1__low_1']

In [160]:
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {100*accuracy:.2f}')
print(f'F1-score: {100*f1:.2f}')

Accuracy: 100.00
F1-score: 100.00


#### Using Embedding

In [227]:
X,y = df['desc_context_txt'], df['final_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2, random_state=21)
y_test

1    low__low_1__low_1__low_1
2    low__low_1__low_1__low_1
Name: final_score, dtype: object

In [217]:
def embed_text(text):
  output = co.embed(
                model='medium',
                texts=text)
  return output.embeddings

In [228]:
X_train_emb = np.array(embed_text(X_train.tolist()))
X_test_emb = np.array(embed_text(X_test.tolist()))

In [229]:
from sklearn.svm import SVC
from sklearn import preprocessing

# Prepare the labels
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train_le = le.transform(y_train)
y_test_le = le.transform(y_test)

# Initialize the model
svm_classifier = SVC(class_weight='balanced')

# Fit the training dataset to the model
svm_classifier.fit(X_train_emb, y_train_le)

SVC(class_weight='balanced')

In [230]:
y_pred_le = svm_classifier.predict(X_test_emb)
y_pred_le

array([0, 0])

In [231]:
accuracy = accuracy_score(y_test_le, y_pred_le)
f1 = f1_score(y_test_le, y_pred_le, average='weighted')

print(f'Accuracy: {100*accuracy:.2f}')
print(f'F1-score: {100*f1:.2f}')

Accuracy: 100.00
F1-score: 100.00


`still not accurate because the two test data can only have the score 0, they can't have an other value since we don't have enough samples, example we only have one value with 1.66 so even we label this and try to put it in the test set, it will complain saying that specific label is new, which means we have missed it in the training dataset`

`We will try to manipulate the data by changing the values of 'Analyst_Average_Score'`

In [232]:
df_m = df.copy()

In [None]:
df_m['Analyst_Average_Score'] = [0.00,0.00,2.33,2.33,1.33,1.33,1.66,1.66,0.33]