# Top k main Attributes

# Installations

In [None]:
!pip install spacy 
!python -m spacy download en_core_web_lg

# Import Libraries

In [None]:
import spacy
import os
import csv
import nltk 
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

In [None]:
# Fetch only the NLTK resources this notebook uses instead of opening the
# interactive downloader that a bare nltk.download() starts.
# 'punkt' / 'punkt_tab' back sent_tokenize (which one is needed depends on the
# installed NLTK version); 'stopwords' backs stopwords.words('english').
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Functions

In [None]:
def load_tweets(tweets_csv_path: str, tweets_min_char_threshold: int) -> list:
    """
    Load tweet texts from the first column of a CSV file.

    The first row is treated as a header and skipped. Empty rows (for example
    blank lines in the file) are ignored, and an empty file yields an empty
    list instead of raising.

    Args:
        tweets_csv_path: path to a UTF-8 encoded CSV file whose first column
            holds the tweet text.
        tweets_min_char_threshold: minimum number of characters a tweet must
            have to be kept.

    Returns:
        The kept tweet strings, in file order.
    """
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(tweets_csv_path, encoding='utf8', newline='') as fin:
        reader = csv.reader(fin)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        return [
            row[0]
            for row in reader
            # csv.reader yields [] for blank lines, so guard before indexing.
            if row and len(row[0]) >= tweets_min_char_threshold
        ]
    
def tokenize_tweets(tweet_list: list, nlp: spacy.lang.en.English
    ) -> list:
    """
    Split every tweet into sentences and run each sentence through ``nlp``.

    Args:
        tweet_list: the tweet strings to process.
        nlp: the spaCy pipeline; it is called once per sentence.

    Returns:
        One list per tweet, in input order, holding the result of
        ``nlp(sentence)`` for each sentence of that tweet.
    """
    return [
        # sent_tokenize splits the tweet into sentences; each one is processed
        # separately so downstream code can iterate sentence by sentence.
        [nlp(sentence) for sentence in sent_tokenize(tweet)]
        for tweet in tweet_list
    ]

def get_aspects(tweet_list: list, aspect_num: int
    ) -> list:
    """
    Count noun frequencies over processed tweets and return the top ones.

    ``tweet_list`` is a list of tweets, each being a list of processed
    sentences whose tokens expose ``text`` and ``pos_``. A token is counted
    under its lower-cased text when it is not an English stopword, is at
    least 3 characters long and is tagged 'NOUN'.

    Returns the first ``aspect_num`` (term, count) pairs after sorting by
    count in descending order.
    """
    english_stopwords = set(stopwords.words('english'))
    noun_counts = defaultdict(int)

    for sentences in tweet_list:
        for doc in sentences:
            for tok in doc:
                word = tok.text.lower()

                # Skip anything that is a stopword, too short, or not a noun.
                if word in english_stopwords:
                    continue
                if len(word) < 3:
                    continue
                if tok.pos_ != 'NOUN':
                    continue

                noun_counts[word] += 1

    # Highest counts first, then cut down to the requested number of terms.
    ranked = list(noun_counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:aspect_num]

In [None]:
# --- Run settings ---
# Path of the CSV file to read; replace the placeholder before running.
tweets_csv_path = "ENTER YOUR CSV FILE"
# Tweets shorter than this many characters are dropped when loading.
tweets_min_char_threshold = 10
# How many top terms to keep (input the number you want).
aspect_num = 60

# spaCy pipeline used to process each sentence.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Step 1: read the tweet texts from the CSV file.
tweets_list = load_tweets(
    tweets_csv_path=tweets_csv_path,
    tweets_min_char_threshold=tweets_min_char_threshold,
)

# Step 2: split each tweet into sentences and process them with spaCy.
token_list = tokenize_tweets(
    tweet_list=tweets_list,
    nlp=nlp,
)

# Step 3: count the nouns and keep the most frequent ones.
aspect_list = get_aspects(
    tweet_list=token_list,
    aspect_num=aspect_num,
)

## COLLECTING TOP FEATURES

In [None]:
# Bare expression so the notebook displays the result of get_aspects
# computed in the previous cell.
aspect_list