# Modules and session

In [81]:
import csv
import logging

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import col,isnan, when, count
from pyspark.sql.functions import monotonically_increasing_id, row_number

## Stopwords
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords

## Translate and polarity
from textblob import TextBlob
from textblob.exceptions import NotTranslated

[nltk_data] Downloading package punkt to /home/bluterplay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bluterplay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Create (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName('Translate')
    .getOrCreate()
)

# Data

In [6]:
# Load the transcript catalogue: the first line is treated as the header and
# column types are inferred from the data.
data = (
    spark.read
    .option('header', 'true')
    .csv('files.csv', inferSchema=True)
)

                                                                                

In [7]:
# Preview the first five rows of the loaded frame.
data.show(5)

+--------------------+------------+--------------------+
|        wav_filename|wav_filesize|          transcript|
+--------------------+------------+--------------------+
|audios/4da6b70e-0...|      219064|y aquí en dos pal...|
|audios/8c2ab30b-0...|      271910|cuando los consej...|
|audios/ca73c951-c...|       64520|su mujer con la c...|
|audios/067c4606-7...|       84222|y otros que se po...|
|audios/49a08f90-3...|       77316|al oír mis pasos ...|
+--------------------+------------+--------------------+
only showing top 5 rows



# Preprocessing

## Missing values

In [8]:
# Count, for every column, the rows whose value is NaN or NULL.
data.select(
    [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in data.columns
    ]
).show()



+------------+------------+----------+
|wav_filename|wav_filesize|transcript|
+------------+------------+----------+
|           0|           0|         0|
+------------+------------+----------+



                                                                                

In [9]:
# Broader check: besides NULL and NaN, also count values containing the
# literal markers 'None' / 'NULL' and values that are the empty string.
data.select(
    [
        count(
            when(
                col(name).contains('None')
                | col(name).contains('NULL')
                | (col(name) == '')
                | col(name).isNull()
                | isnan(name),
                name,
            )
        ).alias(name)
        for name in data.columns
    ]
).show()



+------------+------------+----------+
|wav_filename|wav_filesize|transcript|
+------------+------------+----------+
|           0|           0|         0|
+------------+------------+----------+



                                                                                

## Translate

In [10]:
# Work on the underlying RDD of Row objects for the row-wise text processing below.
rdd= data.rdd

In [11]:
# Inspect two raw rows (filename, file size, transcript).
rdd.take(2)

                                                                                

[Row(wav_filename='audios/4da6b70e-0108-4f75-80ae-3d71f1dd2c2b.wav', wav_filesize=219064, transcript='y aquí en dos palotadas hemos encontrado robustas columnas donde apoyar la grandiosa fábrica de su alcurnia'),
 Row(wav_filename='audios/8c2ab30b-0fd4-41c3-9724-3b15f2ee2c27.wav', wav_filesize=271910, transcript='cuando los consejeros escucharon aquello quedaron estremecidos y se dijeron dios ha prohibido que padres se casen con sus hijas')]

### Drop stopwords

In [15]:
# NLTK's Spanish stopword list, used below to filter transcript tokens.
stopword_list  = stopwords.words('spanish')

In [16]:
# Split each transcript (third field) on single spaces, keeping the first two fields as-is.
rdd_word_list = rdd.map(
    lambda row: (row[0], row[1], row[2].split(" "))
)

In [17]:
# Membership tests against a frozenset are constant time; testing against the
# original list scanned every stopword for every token of every transcript.
stopword_set = frozenset(stopword_list)

# Remove stopword tokens, re-join the remaining tokens with single spaces
# (one pass instead of two chained maps), then discard rows whose transcript
# is empty after filtering.
rdd_stopwords = (
    rdd_word_list
    .map(lambda row: (row[0], row[1],
                      " ".join(word for word in row[2] if word not in stopword_set)))
    .filter(lambda row: row[2] != '')
)

In [18]:
# Inspect two rows after stopword removal.
rdd_stopwords.take(2)

[('audios/4da6b70e-0108-4f75-80ae-3d71f1dd2c2b.wav',
  219064,
  'aquí dos palotadas encontrado robustas columnas apoyar grandiosa fábrica alcurnia'),
 ('audios/8c2ab30b-0fd4-41c3-9724-3b15f2ee2c27.wav',
  271910,
  'consejeros escucharon aquello quedaron estremecidos dijeron dios prohibido padres casen hijas')]

In [37]:
def trans(x):
    """Translate Spanish text to English, falling back to the input text.

    Parameters
    ----------
    x : str
        Spanish text to translate.

    Returns
    -------
    str
        The English translation, or ``x`` unchanged when TextBlob raises
        ``NotTranslated``. Empty or whitespace-only input is returned as-is
        without calling the translator.

    Notes
    -----
    Only ``NotTranslated`` is handled; any other exception raised by
    ``translate`` propagates to the caller.

    NOTE(review): ``TextBlob.translate`` appears to be deprecated upstream and
    missing from recent TextBlob releases -- confirm the installed version.
    """
    # Nothing to translate: skip the translator call entirely.
    if not x or not x.strip():
        return x or ''
    try:
        return str(TextBlob(x).translate(from_lang='es', to='en'))
    except NotTranslated:
        # The original also had an `else:` branch here; it could never run
        # because the `try` body always returns.
        return x

In [39]:
# Translate the stopword-free transcript of every row and bring the results
# back to the driver as a plain Python list.
traduccion = (
    rdd_stopwords
    .map(lambda row: trans(row[2]))
    .collect()
)

                                                                                

### Merge 

In [64]:
# Convert the filtered RDD to a DataFrame; no column names are given, so the
# columns are auto-named (they appear as _1, _2, ... in later output).
final = rdd_stopwords.toDF()

In [66]:
# Wrap every translated string in a 1-tuple so it becomes a single-column row.
traduccionDF = spark.createDataFrame(
    [(text,) for text in traduccion],
    ["Traduccion"],
)

In [71]:
# Give both frames a 1-based "row_idx" column so they can be joined row by row.
# The window has no partitioning, so Spark moves all data to a single
# partition (see the WindowExec warnings emitted when the result is written).
# NOTE(review): the pairing assumes both frames enumerate their rows in the
# same order in which `traduccion` was collected -- confirm, or carry the
# translation inside the same RDD row instead of joining by position.
final = final.withColumn("row_idx", row_number().over(Window.orderBy(monotonically_increasing_id())))
traduccionDF = traduccionDF.withColumn("row_idx", row_number().over(Window.orderBy(monotonically_increasing_id())))

In [79]:
# Pair every row with its translation through the shared row index, then
# discard the helper index column.
final_df = (
    final
    .join(traduccionDF, final.row_idx == traduccionDF.row_idx)
    .drop("row_idx")
)

### CSV file

In [109]:
# Collect the joined frame to the driver as a pandas DataFrame and write it
# to 'filesT.csv' without the pandas index column.
final_df.toPandas().to_csv('filesT.csv',index= False )

22/09/23 15:37:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/23 15:37:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/23 15:37:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/23 15:37:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.




22/09/23 15:37:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/23 15:37:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/23 15:37:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/23 15:37:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

22/09/23 15:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/23 15:37:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.




22/09/23 15:37:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/23 15:37:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

# Sentiment Analysis

In [110]:
# Re-read the translated CSV as an RDD of raw text lines (header line included).
rdd_data= spark.sparkContext.textFile("filesT.csv")

In [111]:
# Parse each line with the csv module instead of a bare split(","): the file
# is written with pandas `to_csv`, which quotes fields containing a comma, and
# a naive split would break such rows into extra columns and shift the field
# indexes used by the later cells.
# NOTE(review): fields with embedded line breaks would still be split across
# records by textFile -- confirm the transcripts never contain newlines.
rdd_data = rdd_data.map(lambda line: next(csv.reader([line])))

In [113]:
# The first parsed record is the CSV header; drop every record equal to it.
header = rdd_data.first()
rdd_data = rdd_data.filter(lambda fields: fields != header)

In [114]:
# Inspect two parsed records (every field is a string at this point).
rdd_data.take(2)

[['audios/4da6b70e-0108-4f75-80ae-3d71f1dd2c2b.wav',
  '219064',
  'aquí dos palotadas encontrado robustas columnas apoyar grandiosa fábrica alcurnia',
  'Here two palotadas found robust columns support great Alcurnia factory'],
 ['audios/8c2ab30b-0fd4-41c3-9724-3b15f2ee2c27.wav',
  '271910',
  'consejeros escucharon aquello quedaron estremecidos dijeron dios prohibido padres casen hijas',
  'counselors heard that were shuddered God forbidden parents marry daughters']]

In [102]:
# Scratch example: build a TextBlob from a short English phrase.
blob= TextBlob("support great Alcurnia")

In [137]:
# Check the Python type of the second element of the blob's sentiment result.
type(blob.sentiment[1])

float

In [142]:
def sentiment(x):
    """Return the TextBlob polarity score of the text ``x``."""
    return TextBlob(x).polarity

In [126]:
# Keep only the records whose file size (second field) is below 40000.
test = rdd_data.filter(
    lambda fields: int(fields[1]) < 40000
)

In [127]:
# Number of records in the small-file subset.
test.count()

790

In [153]:
# For every record keep filename, file size and translation, and append the
# polarity score of the translation as a fourth column.
df_sentiment = (
    rdd_data
    .map(lambda fields: (fields[0], fields[1], fields[3], sentiment(fields[3])))
    .toDF()
)

In [146]:
# Raise the threshold of the noisy third-party loggers to ERROR.
# `setLevel` returns None, so each logger is bound first and configured in a
# separate statement; the original chained form left all three names as None.
pyspark_log = logging.getLogger('pyspark')
pyspark_log.setLevel(logging.ERROR)
py4j_logger = logging.getLogger("py4j")
py4j_logger.setLevel(logging.ERROR)
matplotlib_logger = logging.getLogger("matplotlib")
matplotlib_logger.setLevel(logging.ERROR)

In [154]:
# Preview the scored rows (columns are auto-named _1 .. _4).
df_sentiment.show()

[Stage 102:>                                                        (0 + 1) / 1]

+--------------------+------+--------------------+--------------------+
|                  _1|    _2|                  _3|                  _4|
+--------------------+------+--------------------+--------------------+
|audios/4da6b70e-0...|219064|Here two palotada...|                 0.8|
|audios/8c2ab30b-0...|271910|counselors heard ...|                 0.0|
|audios/ca73c951-c...| 64520|    Woman face hands|                 0.0|
|audios/067c4606-7...| 84222|They could pour b...|                 0.0|
|audios/49a08f90-3...| 77316|Hear steps raised...|                 0.0|
|audios/f97f3d33-a...|223566|Bonkey Bonfire So...|-0.13333333333333333|
|audios/32edabbd-6...|100054|   hoping to be hung|                 0.0|
|audios/35ef4edf-e...|116342|silver click to s...|                 0.0|
|audios/aeade7e3-2...| 61944|     I come thinking|                 0.0|
|audios/76187caa-b...|115422|prolonged conics ...|                 0.0|
|audios/cf271870-3...|117968|Mala Lucky Winned...|  0.3333333333

                                                                                

In [155]:
# Collect the scored frame to the driver as a pandas DataFrame and write it
# to 'filesS.csv' without the pandas index column.
df_sentiment.toPandas().to_csv('filesS.csv', index= False)

                                                                                